{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,9]],"date-time":"2025-12-09T19:31:47Z","timestamp":1765308707668,"version":"3.46.0"},"publisher-location":"New York, NY, USA","reference-count":42,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,27]]},"DOI":"10.1145\/3746027.3755443","type":"proceedings-article","created":{"date-parts":[[2025,10,25]],"date-time":"2025-10-25T05:50:47Z","timestamp":1761371447000},"page":"4446-4454","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["CrossMind-VL: Multi-Subject Mind-to-Video Decoding with Multimodal LLM Semantic Grounding"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0006-5504-1768","authenticated-orcid":false,"given":"Xuanliu","family":"Zhu","sequence":"first","affiliation":[{"name":"Beijing University of Posts and Telecommunications, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-8433-3864","authenticated-orcid":false,"given":"Yiqiao","family":"Chai","sequence":"additional","affiliation":[{"name":"Beijing University of Posts and Telecommunications, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-2220-7626","authenticated-orcid":false,"given":"Runnan","family":"Li","sequence":"additional","affiliation":[{"name":"Beijing University of Posts and Telecommunications, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0986-1336","authenticated-orcid":false,"given":"Mingying","family":"Lan","sequence":"additional","affiliation":[{"name":"Beijing University of Posts and Telecommunications, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-4402-6643","authenticated-orcid":false,"given":"Li","family":"Gao","sequence":"additional","affiliation":[{"name":"Beijing University of Posts and Telecommunications, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2025,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Advances in Neural Information Processing Systems","volume":"32","author":"Beliy Roman","year":"2019","unstructured":"Roman Beliy, Guy Gaziv, Assaf Hoogi, Francesca Strappini, Tal Golan, and Michal Irani. 2019. From voxels to pixels and back: Self-supervision in natural-image reconstruction from fMRI. Advances in Neural Information Processing Systems, Vol. 32 (2019)."},{"key":"e_1_3_2_1_2_1","volume-title":"Garnett (Eds.)","volume":"28","author":"Chen Po-Hsuan","year":"2015","unstructured":"Po-Hsuan (Cameron) Chen, Janice Chen, Yaara Yeshurun, Uri Hasson, James Haxby, and Peter J Ramadge. 2015. A Reduced-Dimension fMRI Shared Response Model. In Advances in Neural Information Processing Systems, C. Cortes, N. Lawrence, D. Lee, M. Sugiyama, and R. Garnett (Eds.), Vol. 28. Curran Associates, Inc. https:\/\/proceedings.neurips.cc\/paper_files\/paper\/2015\/file\/b3967a0e938dc2a6340e258630febd5a-Paper.pdf"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.02175"},{"key":"e_1_3_2_1_4_1","first-page":"24841","volume-title":"Levine (Eds.)","volume":"36","author":"Chen Zijiao","year":"2023","unstructured":"Zijiao Chen, Jiaxin Qing, and Juan Helen Zhou. 2023a. Cinematic Mindscapes: High-quality Video Reconstruction from Brain Activity. In Advances in Neural Information Processing Systems, A. Oh, T. Naumann, A. Globerson, K. Saenko, M. Hardt, and S. Levine (Eds.), Vol. 36. Curran Associates, Inc., 24841-24858. https:\/\/proceedings.neurips.cc\/paper_files\/paper\/2023\/file\/4e5e0daf4b05d8bfc6377f33fd53a8f4-Paper-Conference.pdf"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/TNNLS.2020.3028167"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2023.3263181"},{"key":"e_1_3_2_1_7_1","volume-title":"Modular encoding and decoding models derived from Bayesian canonical correlation analysis. Neural computation","author":"Fujiwara Yusuke","year":"2013","unstructured":"Yusuke Fujiwara, Yoichi Miyawaki, and Yukiyasu Kamitani. 2013. Modular encoding and decoding models derived from Bayesian canonical correlation analysis. Neural computation, Vol. 25, 4 (2013), 979-1005."},{"key":"e_1_3_2_1_8_1","first-page":"171","volume-title":"Nature","volume":"536","author":"Glasser Matthew F","year":"2016","unstructured":"Matthew F Glasser, Timothy S Coalson, Emma C Robinson, Carl D Hacker, John Harwell, Essa Yacoub, Kamil Ugurbil, Jesper Andersson, Christian F Beckmann, Mark Jenkinson, et al., 2016. A multi-modal parcellation of human cerebral cortex. Nature, Vol. 536, 7615 (2016), 171-178."},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.neuroimage.2019.05.039"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.neuroimage.2019.05.039"},{"key":"e_1_3_2_1_11_1","first-page":"2425","volume-title":"Science","volume":"293","author":"Haxby James V","year":"2001","unstructured":"James V Haxby, M Ida Gobbini, Maura L Furey, Alumit Ishai, Jennifer L Schouten, and Pietro Pietrini. 2001. Distributed and overlapping representations of faces and objects in ventral temporal cortex. Science, Vol. 293, 5539 (2001), 2425-2430."},{"key":"e_1_3_2_1_12_1","volume-title":"Hyperalignment: Modeling shared information encoded in idiosyncratic cortical topographies. elife","author":"Haxby James V","year":"2020","unstructured":"James V Haxby, J Swaroop Guntupalli, Samuel A Nastase, and Ma Feilong. 2020a. Hyperalignment: Modeling shared information encoded in idiosyncratic cortical topographies. elife, Vol. 9 (2020), e56601."},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.7554\/eLife.56601"},{"key":"e_1_3_2_1_14_1","volume-title":"Generic decoding of seen and imagined objects using hierarchical visual features. Nature communications","author":"Horikawa Tomoyasu","year":"2017","unstructured":"Tomoyasu Horikawa and Yukiyasu Kamitani. 2017. Generic decoding of seen and imagined objects using hierarchical visual features. Nature communications, Vol. 8, 1 (2017), 15037."},{"key":"e_1_3_2_1_15_1","volume-title":"Decoding the visual and subjective contents of the human brain. Nature neuroscience","author":"Kamitani Yukiyasu","year":"2005","unstructured":"Yukiyasu Kamitani and Frank Tong. 2005. Decoding the visual and subjective contents of the human brain. Nature neuroscience, Vol. 8, 5 (2005), 679-685."},{"key":"e_1_3_2_1_16_1","first-page":"352","volume-title":"Nature","volume":"452","author":"Kay Kendrick N","year":"2008","unstructured":"Kendrick N Kay, Thomas Naselaris, Ryan J Prenger, and Jack L Gallant. 2008a. Identifying natural images from human brain activity. Nature, Vol. 452, 7185 (2008), 352-355."},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1038\/nature06713"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1038\/jcbfm.2012.23"},{"key":"e_1_3_2_1_19_1","volume-title":"A penny for your (visual) thoughts: Self-supervised reconstruction of natural movies from brain activity. arXiv preprint arXiv:2206.03544","author":"Kupershmidt Ganit","year":"2022","unstructured":"Ganit Kupershmidt, Roman Beliy, Guy Gaziv, and Michal Irani. 2022. A penny for your (visual) thoughts: Self-supervised reconstruction of natural movies from brain activity. arXiv preprint arXiv:2206.03544 (2022)."},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICPR.2018.8545855"},{"key":"e_1_3_2_1_21_1","volume-title":"International conference on machine learning. PMLR","author":"Li Junnan","year":"2023","unstructured":"Junnan Li, Dongxu Li, Silvio Savarese, and Steven Hoi. 2023. Blip-2: Bootstrapping language-image pre-training with frozen image encoders and large language models. In International conference on machine learning. PMLR, 19730-19742."},{"key":"e_1_3_2_1_22_1","volume-title":"International conference on machine learning. PMLR, 12888-12900","author":"Li Junnan","year":"2022","unstructured":"Junnan Li, Dongxu Li, Caiming Xiong, and Steven Hoi. 2022. Blip: Bootstrapping language-image pre-training for unified vision-language understanding and generation. In International conference on machine learning. PMLR, 12888-12900."},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i03.5650"},{"key":"e_1_3_2_1_24_1","volume-title":"Visual instruction tuning. Advances in neural information processing systems","author":"Liu Haotian","year":"2023","unstructured":"Haotian Liu, Chunyuan Li, Qingyang Wu, and Yong Jae Lee. 2023. Visual instruction tuning. Advances in neural information processing systems, Vol. 36 (2023), 34892-34916."},{"key":"e_1_3_2_1_25_1","series-title":"Series B: Biological Sciences","volume-title":"The neural basis of the blood-oxygen-level-dependent functional magnetic resonance imaging signal. Philosophical Transactions of the Royal Society of London","author":"Logothetis Nikos K","year":"2002","unstructured":"Nikos K Logothetis. 2002. The neural basis of the blood-oxygen-level-dependent functional magnetic resonance imaging signal. Philosophical Transactions of the Royal Society of London. Series B: Biological Sciences, Vol. 357, 1424 (2002), 1003-1037."},{"volume-title":"The Thirteenth International Conference on Learning Representations.","author":"Lu Yizhuo","key":"e_1_3_2_1_26_1","unstructured":"Yizhuo Lu, Changde Du, Chong Wang, Xuanliu Zhu, Liuyun Jiang, Xujin Li, and Huiguang He. [n.d.]. Animate Your Thoughts: Reconstruction of Dynamic Natural Vision from Human Brain Activity. In The Thirteenth International Conference on Learning Representations."},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3613832"},{"key":"e_1_3_2_1_28_1","volume-title":"Informatics and data mining tools and strategies for the human connectome project. Frontiers in neuroinformatics","author":"Marcus Daniel S","year":"2011","unstructured":"Daniel S Marcus, John Harwell, Timothy Olsen, Michael Hodge, Matthew F Glasser, Fred Prior, Mark Jenkinson, Timothy Laumann, Sandra W Curtiss, and David C Van Essen. 2011. Informatics and data mining tools and strategies for the human connectome project. Frontiers in neuroinformatics, Vol. 5 (2011), 4."},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.neuron.2009.09.006"},{"key":"e_1_3_2_1_30_1","volume-title":"Reconstructing visual experiences from brain activity evoked by natural movies. Current biology","author":"Nishimoto Shinji","year":"2011","unstructured":"Shinji Nishimoto, An T Vu, Thomas Naselaris, Yuval Benjamini, Bin Yu, and Jack L Gallant. 2011. Reconstructing visual experiences from brain activity evoked by natural movies. Current biology, Vol. 21, 19 (2011), 1641-1646."},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1038\/s41598-023-42891-8"},{"key":"e_1_3_2_1_32_1","volume-title":"International conference on machine learning. PMLR, 8748-8763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al., 2021. Learning transferable visual models from natural language supervision. In International conference on machine learning. PMLR, 8748-8763."},{"key":"e_1_3_2_1_33_1","first-page":"24705","article-title":"Reconstructing the mind's eye: fmri-to-image with contrastive learning and diffusion priors","volume":"36","author":"Scotti Paul","year":"2023","unstructured":"Paul Scotti, Atmadeep Banerjee, Jimmie Goode, Stepan Shabalin, Alex Nguyen, Aidan Dempster, Nathalie Verlinde, Elad Yundler, David Weisberg, Kenneth Norman, et al., 2023. Reconstructing the mind's eye: fmri-to-image with contrastive learning and diffusion priors. Advances in Neural Information Processing Systems, Vol. 36 (2023), 24705-24728.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_34_1","volume-title":"Outrageously large neural networks: The sparsely-gated mixture-of-experts layer. arXiv preprint arXiv:1701.06538","author":"Shazeer Noam","year":"2017","unstructured":"Noam Shazeer, Azalia Mirhoseini, Krzysztof Maziarz, Andy Davis, Quoc Le, Geoffrey Hinton, and Jeff Dean. 2017. Outrageously large neural networks: The sparsely-gated mixture-of-experts layer. arXiv preprint arXiv:1701.06538 (2017)."},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.neuron.2013.06.034"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01389"},{"key":"e_1_3_2_1_37_1","volume-title":"Attention is all you need. Advances in neural information processing systems","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, \u0141ukasz Kaiser, and Illia Polosukhin. 2017. Attention is all you need. Advances in neural information processing systems, Vol. 30 (2017)."},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1093\/cercor\/bhab498"},{"key":"e_1_3_2_1_39_1","unstructured":"Peng Wang Shuai Bai Sinan Tan Shijie Wang Zhihao Fan Jinze Bai Keqin Chen Xuejing Liu Jialin Wang Wenbin Ge et al. 2024. Qwen2-vl: Enhancing vision-language model's perception of the world at any resolution. arXiv preprint arXiv:2409.12191 (2024)."},{"key":"e_1_3_2_1_40_1","volume-title":"Neural encoding and decoding with deep learning for dynamic natural vision. Cerebral cortex","author":"Wen Haiguang","year":"2018","unstructured":"Haiguang Wen, Junxing Shi, Yizhen Zhang, Kun-Han Lu, Jiayue Cao, and Zhongming Liu. 2018. Neural encoding and decoding with deep learning for dynamic natural vision. Cerebral cortex, Vol. 28, 12 (2018), 4136-4160."},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.neuroimage.2004.10.034"},{"key":"e_1_3_2_1_42_1","volume-title":"Brain decoding-classification of hand written digits from fMRI data employing Bayesian networks. Frontiers in human neuroscience","author":"Yargholi Elahe'","year":"2016","unstructured":"Elahe' Yargholi and Gholam-Ali Hossein-Zadeh. 2016. Brain decoding-classification of hand written digits from fMRI data employing Bayesian networks. Frontiers in human neuroscience, Vol. 10 (2016), 351."}],"event":{"name":"MM '25: The 33rd ACM International Conference on Multimedia","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Dublin Ireland","acronym":"MM '25"},"container-title":["Proceedings of the 33rd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3746027.3755443","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,9]],"date-time":"2025-12-09T19:29:06Z","timestamp":1765308546000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3746027.3755443"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,27]]},"references-count":42,"alternative-id":["10.1145\/3746027.3755443","10.1145\/3746027"],"URL":"https:\/\/doi.org\/10.1145\/3746027.3755443","relation":{},"subject":[],"published":{"date-parts":[[2025,10,27]]},"assertion":[{"value":"2025-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}