{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,16]],"date-time":"2026-02-16T17:06:14Z","timestamp":1771261574307,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":87,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,10,28]],"date-time":"2024-10-28T00:00:00Z","timestamp":1730073600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,10,28]]},"DOI":"10.1145\/3664647.3680810","type":"proceedings-article","created":{"date-parts":[[2024,10,26]],"date-time":"2024-10-26T06:59:27Z","timestamp":1729925967000},"page":"7316-7325","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":7,"title":["Hypergraph Multi-modal Large Language Model: Exploiting EEG and Eye-tracking Modalities to Evaluate Heterogeneous Responses for Video Understanding"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0001-4577-3002","authenticated-orcid":false,"given":"Minghui","family":"Wu","sequence":"first","affiliation":[{"name":"Mininglamp Technology &amp; Peking University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4044-5701","authenticated-orcid":false,"given":"Chenxu","family":"Zhao","sequence":"additional","affiliation":[{"name":"Mininglamp Technology, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0085-3527","authenticated-orcid":false,"given":"Anyang","family":"Su","sequence":"additional","affiliation":[{"name":"Mininglamp Technology, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2270-3378","authenticated-orcid":false,"given":"Donglin","family":"Di","sequence":"additional","affiliation":[{"name":"Shanghai Artificial Intelligence Lab, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-7656-350X","authenticated-orcid":false,"given":"Tianyu","family":"Fu","sequence":"additional","affiliation":[{"name":"Shanghai Artificial Intelligence Lab, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-8902-6422","authenticated-orcid":false,"given":"Da","family":"An","sequence":"additional","affiliation":[{"name":"Shanghai Artificial Intelligence Lab, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-5795-3885","authenticated-orcid":false,"given":"Min","family":"He","sequence":"additional","affiliation":[{"name":"MiningLamp Technology, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9785-922X","authenticated-orcid":false,"given":"Ya","family":"Gao","sequence":"additional","affiliation":[{"name":"Peking University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1963-2513","authenticated-orcid":false,"given":"Meng","family":"Ma","sequence":"additional","affiliation":[{"name":"Peking University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1234-6119","authenticated-orcid":false,"given":"Kun","family":"Yan","sequence":"additional","affiliation":[{"name":"Peking University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8854-2079","authenticated-orcid":false,"given":"Ping","family":"Wang","sequence":"additional","affiliation":[{"name":"Peking University, Beijing, China"}]}],"member":"320","published-online":{"date-parts":[[2024,10,28]]},"reference":[{"key":"e_1_3_2_2_1_1","volume-title":"Diogo Almeida, Janko Altenschmidt, Sam Altman, Shyamal Anadkat, et al.","author":"Achiam Josh","year":"2023","unstructured":"Josh Achiam, Steven Adler, Sandhini Agarwal, Lama Ahmad, Ilge Akkaya, Florencia Leoni Aleman, Diogo Almeida, Janko Altenschmidt, Sam Altman, Shyamal Anadkat, et al. 2023. Gpt-4 technical report. arXiv preprint arXiv:2303.08774 (2023)."},{"key":"e_1_3_2_2_2_1","unstructured":"Jean-Baptiste Alayrac Jeff Donahue Pauline Luc Antoine Miech Iain Barr Yana Hasson Karel Lenc Arthur Mensch Katherine Millican Malcolm Reynolds et al. 2022. Flamingo: a visual language model for few-shot learning. Advances in neural information processing systems Vol. 35 (2022) 23716--23736."},{"key":"e_1_3_2_2_3_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00676"},{"key":"e_1_3_2_2_4_1","volume-title":"Hypersage: Generalizing inductive representation learning on hypergraphs. arXiv preprint arXiv:2010.04558","author":"Arya Devanshu","year":"2020","unstructured":"Devanshu Arya, Deepak K Gupta, Stevan Rudinac, and Marcel Worring. 2020. Hypersage: Generalizing inductive representation learning on hypergraphs. arXiv preprint arXiv:2010.04558 (2020)."},{"key":"e_1_3_2_2_5_1","volume-title":"CEUR Workshop Proceedings.","author":"Azcona David","year":"2020","unstructured":"David Azcona, Enric Moreu, Feiyan Hu, Tom\u00e1s E Ward, and Alan F Smeaton. 2020. Predicting media memorability using ensemble models. CEUR Workshop Proceedings."},{"key":"e_1_3_2_2_6_1","volume-title":"Hypergraphs: combinatorics of finite sets","author":"Berge Claude","unstructured":"Claude Berge. 1984. Hypergraphs: combinatorics of finite sets. Vol. 45. Elsevier."},{"key":"e_1_3_2_2_7_1","first-page":"4","article-title":"Is space-time attention all you need for video understanding?","volume":"2","author":"Bertasius Gedas","year":"2021","unstructured":"Gedas Bertasius, Heng Wang, and Lorenzo Torresani. 2021. Is space-time attention all you need for video understanding?. In ICML, Vol. 2. 4.","journal-title":"ICML"},{"key":"e_1_3_2_2_8_1","volume-title":"Neuronal oscillations in cortical networks. science","author":"Buzsaki Gyorgy","year":"2004","unstructured":"Gyorgy Buzsaki and Andreas Draguhn. 2004. Neuronal oscillations in cortical networks. science, Vol. 304, 5679 (2004), 1926--1929."},{"key":"e_1_3_2_2_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/THMS.2023.3275626"},{"key":"e_1_3_2_2_10_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.502"},{"key":"e_1_3_2_2_11_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58577-8_7"},{"key":"e_1_3_2_2_12_1","volume-title":"Vicuna: An open-source chatbot impressing gpt-4 with 90%* chatgpt quality. See https:\/\/vicuna. lmsys. org (accessed","author":"Chiang Wei-Lin","year":"2023","unstructured":"Wei-Lin Chiang, Zhuohan Li, Zi Lin, Ying Sheng, Zhanghao Wu, Hao Zhang, Lianmin Zheng, Siyuan Zhuang, Yonghao Zhuang, Joseph E Gonzalez, et al. 2023. Vicuna: An open-source chatbot impressing gpt-4 with 90%* chatgpt quality. See https:\/\/vicuna. lmsys. org (accessed 14 April 2023), Vol. 2, 3 (2023), 6."},{"key":"e_1_3_2_2_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00262"},{"key":"e_1_3_2_2_14_1","doi-asserted-by":"publisher","DOI":"10.1145\/3301299"},{"key":"e_1_3_2_2_15_1","volume-title":"Junqi Zhao, Weisheng Wang, Boyang Li, Pascale N Fung, and Steven Hoi.","author":"Dai Wenliang","year":"2024","unstructured":"Wenliang Dai, Junnan Li, Dongxu Li, Anthony Meng Huat Tiong, Junqi Zhao, Weisheng Wang, Boyang Li, Pascale N Fung, and Steven Hoi. 2024. Instructblip: Towards general-purpose vision-language models with instruction tuning. Advances in Neural Information Processing Systems, Vol. 36 (2024)."},{"key":"e_1_3_2_2_16_1","volume-title":"Proceedings, Part V 16","author":"Diba Ali","year":"2020","unstructured":"Ali Diba, Mohsen Fayyaz, Vivek Sharma, Manohar Paluri, J\u00fcrgen Gall, Rainer Stiefelhagen, and Luc Van Gool. 2020. Large scale holistic video understanding. In Computer Vision--ECCV 2020: 16th European Conference, Glasgow, UK, August 23--28, 2020, Proceedings, Part V 16. Springer, 593--610."},{"key":"e_1_3_2_2_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298878"},{"key":"e_1_3_2_2_18_1","volume-title":"Proceedings, Part III 14","author":"Escorcia Victor","year":"2016","unstructured":"Victor Escorcia, Fabian Caba Heilbron, Juan Carlos Niebles, and Bernard Ghanem. 2016. Daps: Deep action proposals for action understanding. In Computer Vision--ECCV 2016: 14th European Conference, Amsterdam, The Netherlands, October 11--14, 2016, Proceedings, Part III 14. Springer, 768--784."},{"key":"e_1_3_2_2_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00630"},{"key":"e_1_3_2_2_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.213"},{"key":"e_1_3_2_2_21_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v33i01.33013558"},{"key":"e_1_3_2_2_22_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2012.2199502"},{"key":"e_1_3_2_2_23_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2012.2202676"},{"key":"e_1_3_2_2_24_1","first-page":"2548","article-title":"Hypergraph learning: Methods and practices","volume":"44","author":"Gao Yue","year":"2020","unstructured":"Yue Gao, Zizhao Zhang, Haojie Lin, Xibin Zhao, Shaoyi Du, and Changqing Zou. 2020. Hypergraph learning: Methods and practices. IEEE Transactions on Pattern Analysis and Machine Intelligence, Vol. 44, 5 (2020), 2548--2566.","journal-title":"IEEE Transactions on Pattern Analysis and Machine Intelligence"},{"key":"e_1_3_2_2_25_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01457"},{"key":"e_1_3_2_2_26_1","volume-title":"Unignn: a unified framework for graph and hypergraph neural networks. arXiv preprint arXiv:2105.00956","author":"Huang Jing","year":"2021","unstructured":"Jing Huang and Jie Yang. 2021. Unignn: a unified framework for graph and hypergraph neural networks. arXiv preprint arXiv:2105.00956 (2021)."},{"key":"e_1_3_2_2_27_1","volume-title":"Barun Patra, et al.","author":"Huang Shaohan","year":"2024","unstructured":"Shaohan Huang, Li Dong, Wenhui Wang, Yaru Hao, Saksham Singhal, Shuming Ma, Tengchao Lv, Lei Cui, Owais Khan Mohammed, Barun Patra, et al. 2024. Language is not all you need: Aligning perception with language models. Advances in Neural Information Processing Systems, Vol. 36 (2024)."},{"key":"e_1_3_2_2_28_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298638"},{"key":"e_1_3_2_2_29_1","doi-asserted-by":"publisher","DOI":"10.23919\/cje.2021.00.196"},{"key":"e_1_3_2_2_30_1","volume-title":"LoRA: Low-Rank Adaptation of Large Language Models. arXiv: Computation and Language,arXiv: Computation and Language (Jun","author":"HuEdward","year":"2021","unstructured":"HuEdward J., Yulong Shen, Phillip Wallis, Zeyuan Allen-Zhu, Yuanzhi Li, Shean Wang, and Weizhu Chen. 2021. LoRA: Low-Rank Adaptation of Large Language Models. arXiv: Computation and Language,arXiv: Computation and Language (Jun 2021)."},{"key":"e_1_3_2_2_31_1","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr.2017.149"},{"key":"e_1_3_2_2_32_1","volume-title":"International conference on machine learning. PMLR, 4904--4916","author":"Jia Chao","year":"2021","unstructured":"Chao Jia, Yinfei Yang, Ye Xia, Yi-Ting Chen, Zarana Parekh, Hieu Pham, Quoc Le, Yun-Hsuan Sung, Zhen Li, and Tom Duerig. 2021. Scaling up visual and vision-language representation learning with noisy text supervision. In International conference on machine learning. PMLR, 4904--4916."},{"key":"e_1_3_2_2_33_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.ins.2019.03.012"},{"key":"e_1_3_2_2_34_1","volume-title":"HICSS 2019 symposium on cybersecurity big data analytics.","author":"Joslyn Cliff","year":"2019","unstructured":"Cliff Joslyn, Sinan Aksoy, Dustin Arendt, Louis Jenkins, Brenda Praggastis, Emilie Purvine, and Marcin Zalewski. 2019. High performance hypergraph analytics of domain name system relationships. In HICSS 2019 symposium on cybersecurity big data analytics."},{"key":"e_1_3_2_2_35_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.ins.2014.06.028"},{"key":"e_1_3_2_2_36_1","doi-asserted-by":"crossref","unstructured":"Chamandeep Kaur Preeti Singh et al. 2015. EEG derived neuronal dynamics during meditation: Progress and challenges. Advances in preventive medicine Vol. 2015 (2015).","DOI":"10.1155\/2015\/614723"},{"key":"e_1_3_2_2_37_1","volume-title":"Changyou Chen, et al.","author":"Khandelwal Ashmit","year":"2023","unstructured":"Ashmit Khandelwal, Aditya Agrawal, Aanisha Bhattacharyya, Yaman K Singla, Somesh Singh, Uttaran Bhattacharya, Ishita Dasgupta, Stefano Petrangeli, Rajiv Ratn Shah, Changyou Chen, et al. 2023. Large content and behavior models to understand, simulate, and optimize content and behavior. arXiv preprint arXiv:2309.00359 (2023)."},{"key":"e_1_3_2_2_38_1","volume-title":"EEG alpha and theta oscillations reflect cognitive and memory performance: a review and analysis. Brain research reviews","author":"Klimesch Wolfgang","year":"1999","unstructured":"Wolfgang Klimesch. 1999. EEG alpha and theta oscillations reflect cognitive and memory performance: a review and analysis. Brain research reviews, Vol. 29, 2--3 (1999), 169--195."},{"key":"e_1_3_2_2_39_1","volume-title":"Grounding language models to images for multimodal generation. arXiv preprint arXiv:2301.13823","author":"Koh Jing Yu","year":"2023","unstructured":"Jing Yu Koh, Ruslan Salakhutdinov, and Daniel Fried. 2023. Grounding language models to images for multimodal generation. arXiv preprint arXiv:2301.13823, Vol. 2 (2023)."},{"key":"e_1_3_2_2_40_1","volume-title":"International conference on machine learning. PMLR","author":"Li Junnan","year":"2023","unstructured":"Junnan Li, Dongxu Li, Silvio Savarese, and Steven Hoi. 2023. Blip-2: Bootstrapping language-image pre-training with frozen image encoders and large language models. In International conference on machine learning. PMLR, 19730--19742."},{"key":"e_1_3_2_2_41_1","volume-title":"Videochat: Chat-centric video understanding. arXiv preprint arXiv:2305.06355","author":"Li KunChang","year":"2023","unstructured":"KunChang Li, Yinan He, Yi Wang, Yizhuo Li, Wenhai Wang, Ping Luo, Yali Wang, Limin Wang, and Yu Qiao. 2023. Videochat: Chat-centric video understanding. arXiv preprint arXiv:2305.06355 (2023)."},{"key":"e_1_3_2_2_42_1","unstructured":"Kunchang Li Yali Wang Yinan He Yizhuo Li Yi Wang Yi Liu Zun Wang Jilan Xu Guo Chen Ping Luo Limin Wang and Yu Qiao. 2023. MVBench: A Comprehensive Multi-modal Video Understanding Benchmark. arxiv: 2311.17005 [cs.CV]"},{"key":"e_1_3_2_2_43_1","volume-title":"2023 d","author":"Li Kunchang","unstructured":"Kunchang Li, Yali Wang, Yizhuo Li, Yi Wang, Yinan He, Limin Wang, and Yu Qiao. 2023 d. Unmasked Teacher: Towards Training-Efficient Video Foundation Models. arxiv: 2303.16058 [cs.CV]"},{"key":"e_1_3_2_2_44_1","volume-title":"2023 e. Towards robust multimodal sentiment analysis under uncertain signal missing","author":"Li Mingcheng","year":"2023","unstructured":"Mingcheng Li, Dingkang Yang, and Lihua Zhang. 2023 e. Towards robust multimodal sentiment analysis under uncertain signal missing. IEEE Signal Processing Letters (2023)."},{"key":"e_1_3_2_2_45_1","volume-title":"Proceedings, Part XXX 16","author":"Li Xiujun","year":"2020","unstructured":"Xiujun Li, Xi Yin, Chunyuan Li, Pengchuan Zhang, Xiaowei Hu, Lei Zhang, Lijuan Wang, Houdong Hu, Li Dong, Furu Wei, et al. 2020. Oscar: Object-semantics aligned pre-training for vision-language tasks. In Computer Vision--ECCV 2020: 16th European Conference, Glasgow, UK, August 23--28, 2020, Proceedings, Part XXX 16. Springer, 121--137."},{"key":"e_1_3_2_2_46_1","volume-title":"Supervision exists everywhere: A data efficient contrastive language-image pre-training paradigm. arXiv preprint arXiv:2110.05208","author":"Li Yangguang","year":"2021","unstructured":"Yangguang Li, Feng Liang, Lichen Zhao, Yufeng Cui, Wanli Ouyang, Jing Shao, Fengwei Yu, and Junjie Yan. 2021. Supervision exists everywhere: A data efficient contrastive language-image pre-training paradigm. arXiv preprint arXiv:2110.05208 (2021)."},{"key":"e_1_3_2_2_47_1","volume-title":"Video-llava: Learning united visual representation by alignment before projection. arXiv preprint arXiv:2311.10122","author":"Lin Bin","year":"2023","unstructured":"Bin Lin, Bin Zhu, Yang Ye, Munan Ning, Peng Jin, and Li Yuan. 2023. Video-llava: Learning united visual representation by alignment before projection. arXiv preprint arXiv:2311.10122 (2023)."},{"key":"e_1_3_2_2_48_1","doi-asserted-by":"publisher","DOI":"10.1049\/cje.2021.00.108"},{"key":"e_1_3_2_2_49_1","doi-asserted-by":"publisher","DOI":"10.1145\/3123266.3123343"},{"key":"e_1_3_2_2_50_1","volume-title":"Visual instruction tuning. Advances in neural information processing systems","author":"Liu Haotian","year":"2024","unstructured":"Haotian Liu, Chunyuan Li, Qingyang Wu, and Yong Jae Lee. 2024. Visual instruction tuning. Advances in neural information processing systems, Vol. 36 (2024)."},{"key":"e_1_3_2_2_51_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2016.2621671"},{"key":"e_1_3_2_2_52_1","volume-title":"Networks of spiking neurons: the third generation of neural network models. Neural networks","author":"Maass Wolfgang","year":"1997","unstructured":"Wolfgang Maass. 1997. Networks of spiking neurons: the third generation of neural network models. Neural networks, Vol. 10, 9 (1997), 1659--1671."},{"key":"e_1_3_2_2_53_1","volume-title":"Video-ChatGPT: Towards Detailed Video Understanding via Large Vision and Language Models. arXiv:2306.05424","author":"Maaz Muhammad","year":"2023","unstructured":"Muhammad Maaz, Hanoona Rasheed, Salman Khan, and Fahad Shahbaz Khan. 2023. Video-ChatGPT: Towards Detailed Video Understanding via Large Vision and Language Models. arXiv:2306.05424 (2023)."},{"key":"e_1_3_2_2_54_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCVW54120.2021.00355"},{"key":"e_1_3_2_2_55_1","volume-title":"Proceedings, Part XVI 16","author":"Newman Anelise","year":"2020","unstructured":"Anelise Newman, Camilo Fosco, Vincent Casser, Allen Lee, Barry McNamara, and Aude Oliva. 2020. Multimodal memorability: Modeling effects of semantics and decay on video memorability. In Computer Vision--ECCV 2020: 16th European Conference, Glasgow, UK, August 23--28, 2020, Proceedings, Part XVI 16. Springer, 223--240."},{"key":"e_1_3_2_2_56_1","unstructured":"Long Ouyang Jeffrey Wu Xu Jiang Diogo Almeida Carroll Wainwright Pamela Mishkin Chong Zhang Sandhini Agarwal Katarina Slama Alex Ray et al. 2022. Training language models to follow instructions with human feedback. Advances in neural information processing systems Vol. 35 (2022) 27730--27744."},{"key":"e_1_3_2_2_57_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3548018"},{"key":"e_1_3_2_2_58_1","volume-title":"The brain, emotion, and depression","author":"Rolls Edmund T","unstructured":"Edmund T Rolls. 2018. The brain, emotion, and depression. Oxford University Press."},{"key":"e_1_3_2_2_59_1","doi-asserted-by":"publisher","DOI":"10.1088\/1741-2552\/ab260c"},{"key":"e_1_3_2_2_60_1","doi-asserted-by":"publisher","DOI":"10.1037\/h0077714"},{"key":"e_1_3_2_2_61_1","volume-title":"Two-stream convolutional networks for action recognition in videos. Advances in neural information processing systems","author":"Simonyan Karen","year":"2014","unstructured":"Karen Simonyan and Andrew Zisserman. 2014. Two-stream convolutional networks for action recognition in videos. Advances in neural information processing systems, Vol. 27 (2014)."},{"key":"e_1_3_2_2_62_1","doi-asserted-by":"publisher","DOI":"10.1109\/TAFFC.2018.2817622"},{"key":"e_1_3_2_2_63_1","doi-asserted-by":"publisher","DOI":"10.1109\/CAC51589.2020.9326323"},{"key":"e_1_3_2_2_64_1","volume-title":"Pandagpt: One model to instruction-follow them all. arXiv preprint arXiv:2305.16355","author":"Su Yixuan","year":"2023","unstructured":"Yixuan Su, Tian Lan, Huayang Li, Jialu Xu, Yan Wang, and Deng Cai. 2023. Pandagpt: One model to instruction-follow them all. arXiv preprint arXiv:2305.16355 (2023)."},{"key":"e_1_3_2_2_65_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.bspc.2021.102648"},{"key":"e_1_3_2_2_66_1","unstructured":"Gemini Team Rohan Anil Sebastian Borgeaud Yonghui Wu Jean-Baptiste Alayrac Jiahui Yu Radu Soricut Johan Schalkwyk Andrew M Dai Anja Hauth et al. 2023. Gemini: a family of highly capable multimodal models. arXiv preprint arXiv:2312.11805 (2023)."},{"key":"e_1_3_2_2_67_1","doi-asserted-by":"publisher","DOI":"10.2307\/3611062"},{"key":"e_1_3_2_2_68_1","volume-title":"Git: A generative image-to-text transformer for vision and language. arXiv preprint arXiv:2205.14100","author":"Wang Jianfeng","year":"2022","unstructured":"Jianfeng Wang, Zhengyuan Yang, Xiaowei Hu, Linjie Li, Kevin Lin, Zhe Gan, Zicheng Liu, Ce Liu, and Lijuan Wang. 2022. Git: A generative image-to-text transformer for vision and language. arXiv preprint arXiv:2205.14100 (2022)."},{"key":"e_1_3_2_2_69_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46484-8_2"},{"key":"e_1_3_2_2_70_1","doi-asserted-by":"publisher","DOI":"10.1109\/TKDE.2015.2415497"},{"key":"e_1_3_2_2_71_1","doi-asserted-by":"publisher","DOI":"10.1088\/1757-899X\/569\/3\/032035"},{"key":"e_1_3_2_2_72_1","unstructured":"Jason Wei Yi Tay Rishi Bommasani Colin Raffel Barret Zoph Sebastian Borgeaud Dani Yogatama Maarten Bosma Denny Zhou Donald Metzler et al. 2022. Emergent abilities of large language models. arXiv preprint arXiv:2206.07682 (2022)."},{"key":"e_1_3_2_2_73_1","volume-title":"Next-gpt: Any-to-any multimodal llm. arXiv preprint arXiv:2309.05519","author":"Wu Shengqiong","year":"2023","unstructured":"Shengqiong Wu, Hao Fei, Leigang Qu, Wei Ji, and Tat-Seng Chua. 2023. Next-gpt: Any-to-any multimodal llm. arXiv preprint arXiv:2309.05519 (2023)."},{"key":"e_1_3_2_2_74_1","doi-asserted-by":"publisher","DOI":"10.1088\/1741-2552\/ac49a7"},{"key":"e_1_3_2_2_75_1","unstructured":"Dejing Xu Zhou Zhao Jun Xiao Fei Wu Hanwang Zhang Xiangnan He and Yueting Zhuang. 2017. Video Question Answering via Gradually Refined Attention over Appearance and Motion. In ACM Multimedia."},{"key":"e_1_3_2_2_76_1","volume-title":"Hypergcn: A new method for training graph convolutional networks on hypergraphs. Advances in neural information processing systems","author":"Yadati Naganand","year":"2019","unstructured":"Naganand Yadati, Madhav Nimishakavi, Prateek Yadav, Vikram Nitin, Anand Louis, and Partha Talukdar. 2019. Hypergcn: A new method for training graph convolutional networks on hypergraphs. Advances in neural information processing systems, Vol. 32 (2019)."},{"key":"e_1_3_2_2_77_1","volume-title":"Videogpt: Video generation using vq-vae and transformers. arXiv preprint arXiv:2104.10157","author":"Yan Wilson","year":"2021","unstructured":"Wilson Yan, Yunzhi Zhang, Pieter Abbeel, and Aravind Srinivas. 2021. Videogpt: Video generation using vq-vae and transformers. arXiv preprint arXiv:2104.10157 (2021)."},{"key":"e_1_3_2_2_78_1","unstructured":"Qinghao Ye Haiyang Xu Guohai Xu Jiabo Ye Ming Yan Yiyang Zhou Junyang Wang Anwen Hu Pengcheng Shi Yaya Shi et al. 2023. mplug-owl: Modularization empowers large language models with multimodality. arXiv preprint arXiv:2304.14178 (2023)."},{"key":"e_1_3_2_2_79_1","doi-asserted-by":"crossref","unstructured":"Zhou Yu Dejing Xu Jun Yu Ting Yu Zhou Zhao Yueting Zhuang and Dacheng Tao. 2019. ActivityNet-QA: A Dataset for Understanding Complex Web Videos via Question Answering. In AAAI. 9127--9134.","DOI":"10.1609\/aaai.v33i01.33019127"},{"key":"e_1_3_2_2_80_1","volume-title":"Speechgpt: Empowering large language models with intrinsic cross-modal conversational abilities. arXiv preprint arXiv:2305.11000","author":"Zhang Dong","year":"2023","unstructured":"Dong Zhang, Shimin Li, Xin Zhang, Jun Zhan, Pengyu Wang, Yaqian Zhou, and Xipeng Qiu. 2023. Speechgpt: Empowering large language models with intrinsic cross-modal conversational abilities. arXiv preprint arXiv:2305.11000 (2023)."},{"key":"e_1_3_2_2_81_1","volume-title":"Video-llama: An instruction-tuned audio-visual language model for video understanding. arXiv preprint arXiv:2306.02858","author":"Zhang Hang","year":"2023","unstructured":"Hang Zhang, Xin Li, and Lidong Bing. 2023. Video-llama: An instruction-tuned audio-visual language model for video understanding. arXiv preprint arXiv:2306.02858 (2023)."},{"key":"e_1_3_2_2_82_1","volume-title":"Llama-adapter: Efficient fine-tuning of language models with zero-init attention. arXiv preprint arXiv:2303.16199","author":"Zhang Renrui","year":"2023","unstructured":"Renrui Zhang, Jiaming Han, Chris Liu, Peng Gao, Aojun Zhou, Xiangfei Hu, Shilin Yan, Pan Lu, Hongsheng Li, and Yu Qiao. 2023. Llama-adapter: Efficient fine-tuning of language models with zero-init attention. arXiv preprint arXiv:2303.16199 (2023)."},{"key":"e_1_3_2_2_83_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2018.2862625"},{"key":"e_1_3_2_2_84_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01811"},{"key":"e_1_3_2_2_85_1","volume-title":"Multi-modal ensemble models for predicting video memorability. arXiv preprint arXiv:2102.01173","author":"Zhao Tony","year":"2021","unstructured":"Tony Zhao, Irving Fang, Jeffrey Kim, and Gerald Friedland. 2021. Multi-modal ensemble models for predicting video memorability. arXiv preprint arXiv:2102.01173 (2021)."},{"key":"e_1_3_2_2_86_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.317"},{"key":"e_1_3_2_2_87_1","volume-title":"Minigpt-4: Enhancing vision-language understanding with advanced large language models. arXiv preprint arXiv:2304.10592","author":"Zhu Deyao","year":"2023","unstructured":"Deyao Zhu, Jun Chen, Xiaoqian Shen, Xiang Li, and Mohamed Elhoseiny. 2023. Minigpt-4: Enhancing vision-language understanding with advanced large language models. arXiv preprint arXiv:2304.10592 (2023)."}],"event":{"name":"MM '24: The 32nd ACM International Conference on Multimedia","location":"Melbourne VIC Australia","acronym":"MM '24","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 32nd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3680810","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3664647.3680810","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T01:18:07Z","timestamp":1750295887000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3680810"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,28]]},"references-count":87,"alternative-id":["10.1145\/3664647.3680810","10.1145\/3664647"],"URL":"https:\/\/doi.org\/10.1145\/3664647.3680810","relation":{},"subject":[],"published":{"date-parts":[[2024,10,28]]},"assertion":[{"value":"2024-10-28","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}