{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,4]],"date-time":"2026-06-04T16:10:08Z","timestamp":1780589408254,"version":"3.54.1"},"publisher-location":"Cham","reference-count":52,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031726668","type":"print"},{"value":"9783031726675","type":"electronic"}],"license":[{"start":{"date-parts":[[2024,9,29]],"date-time":"2024-09-29T00:00:00Z","timestamp":1727568000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,9,29]],"date-time":"2024-09-29T00:00:00Z","timestamp":1727568000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-72667-5_14","type":"book-chapter","created":{"date-parts":[[2024,9,28]],"date-time":"2024-09-28T20:11:48Z","timestamp":1727554308000},"page":"242-259","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":17,"title":["UMBRAE: Unified Multimodal Brain Decoding"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-0087-3525","authenticated-orcid":false,"given":"Weihao","family":"Xia","sequence":"first","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3738-1962","authenticated-orcid":false,"given":"Raoul","family":"de Charette","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4700-2236","authenticated-orcid":false,"given":"Cengiz","family":"Oztireli","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1174-610X","authenticated-orcid":false,"given":"Jing-Hao","family":"Xue","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"297","published-online":{"date-parts":[[2024,9,29]]},"reference":[{"issue":"1","key":"14_CR1","doi-asserted-by":"publisher","first-page":"25","DOI":"10.5898\/JHRI.6.1.Admoni","volume":"6","author":"H Admoni","year":"2017","unstructured":"Admoni, H., Scassellati, B.: Social eye gaze in human-robot interaction: a review. J. Human-Robot Interact. 6(1), 25\u201363 (2017)","journal-title":"J. Human-Robot Interact."},{"issue":"1","key":"14_CR2","doi-asserted-by":"publisher","first-page":"116","DOI":"10.1038\/s41593-021-00962-x","volume":"25","author":"EJ Allen","year":"2022","unstructured":"Allen, E.J., et al.: A massive 7T fMRI dataset to bridge cognitive neuroscience and artificial intelligence. Nat. Neurosci. 25(1), 116\u2013126 (2022)","journal-title":"Nat. Neurosci."},{"key":"14_CR3","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"382","DOI":"10.1007\/978-3-319-46454-1_24","volume-title":"Computer Vision \u2013 ECCV 2016","author":"P Anderson","year":"2016","unstructured":"Anderson, P., Fernando, B., Johnson, M., Gould, S.: SPICE: semantic propositional image caption evaluation. In: Leibe, B., Matas, J., Sebe, N., Welling, M. (eds.) ECCV 2016. LNCS, vol. 9909, pp. 382\u2013398. Springer, Cham (2016). https:\/\/doi.org\/10.1007\/978-3-319-46454-1_24"},{"key":"14_CR4","unstructured":"Banerjee, S., Lavie, A.: Meteor: an automatic metric for MT evaluation with improved correlation with human judgments. In: ACL Workshop, pp. 65\u201372 (2005)"},{"key":"14_CR5","first-page":"9912","volume":"33","author":"M Caron","year":"2020","unstructured":"Caron, M., Misra, I., Mairal, J., Goyal, P., Bojanowski, P., Joulin, A.: Unsupervised learning of visual features by contrasting cluster assignments. NeurIPS 33, 9912\u20139924 (2020)","journal-title":"NeurIPS"},{"issue":"1","key":"14_CR6","doi-asserted-by":"publisher","first-page":"1236","DOI":"10.1038\/s41467-022-28859-8","volume":"13","author":"U Chaudhary","year":"2022","unstructured":"Chaudhary, U., et al.: Spelling interface using intracortical signals in a completely locked-in patient enabled via auditory neurofeedback training. Nat. Commun. 13(1), 1236 (2022)","journal-title":"Nat. Commun."},{"key":"14_CR7","unstructured":"Chen, K., Zhang, Z., Zeng, W., Zhang, R., Zhu, F., Zhao, R.: Shikra: unleashing multimodal LLM\u2019s referential dialogue magic. arXiv preprint arXiv:2306.15195 (2023)"},{"key":"14_CR8","unstructured":"Chen, Z., Qing, J., Zhou, J.H.: Cinematic mindscapes: high-quality video reconstruction from brain activity. In: Advances in Neural Information Processing Systems, vol. 36 (2024)"},{"key":"14_CR9","unstructured":"Chiang, W.L., et\u00a0al.: Vicuna: an open-source chatbot impressing GPT-4 with 90%* ChatGPT quality 1(2), 3 (2023). https:\/\/lmsysorg\/blog\/2023-03-30-vicuna"},{"key":"14_CR10","doi-asserted-by":"crossref","unstructured":"Cui, Y., Zhang, Q., Knox, B., Allievi, A., Stone, P., Niekum, S.: The empathic framework for task learning from implicit human feedback. In: CoRL, pp. 604\u2013626. PMLR (2021)","DOI":"10.1609\/aaai.v35i18.17998"},{"issue":"8","key":"14_CR11","doi-asserted-by":"publisher","first-page":"2051","DOI":"10.1523\/JNEUROSCI.04-08-02051.1984","volume":"4","author":"R Desimone","year":"1984","unstructured":"Desimone, R., Albright, T.D., Gross, C.G., Bruce, C.: Stimulus-selective properties of inferior temporal neurons in the macaque. J. Neurosci. 4(8), 2051\u20132062 (1984)","journal-title":"J. Neurosci."},{"key":"14_CR12","unstructured":"Ferrante, M., Ozcelik, F., Boccato, T., VanRullen, R., Toschi, N.: Brain captioning: decoding human brain activity into images and text. arXiv preprint arXiv:2305.11560 (2023)"},{"key":"14_CR13","doi-asserted-by":"crossref","unstructured":"Girdhar, R., et al.: ImageBind: one embedding space to bind them all. In: CVPR (2023)","DOI":"10.1109\/CVPR52729.2023.01457"},{"key":"14_CR14","doi-asserted-by":"crossref","unstructured":"Han, J., et al.: OneLLM: one framework to align all modalities with language. In: CVPR (2024)","DOI":"10.1109\/CVPR52733.2024.02510"},{"key":"14_CR15","doi-asserted-by":"crossref","unstructured":"Hessel, J., Holtzman, A., Forbes, M., Bras, R.L., Choi, Y.: ClipScore: a reference-free evaluation metric for image captioning. In: EMNLP (2021)","DOI":"10.18653\/v1\/2021.emnlp-main.595"},{"issue":"2","key":"14_CR16","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3054912","volume":"50","author":"A Hussein","year":"2017","unstructured":"Hussein, A., Gaber, M.M., Elyan, E., Jayne, C.: Imitation learning: a survey of learning methods. CSUR 50(2), 1\u201335 (2017)","journal-title":"CSUR"},{"key":"14_CR17","unstructured":"Jaegle, A., Gimeno, F., Brock, A., Vinyals, O., Zisserman, A., Carreira, J.: Perceiver: general perception with iterative attention. In: ICML, pp. 4651\u20134664. PMLR (2021)"},{"issue":"11","key":"14_CR18","doi-asserted-by":"publisher","first-page":"4302","DOI":"10.1523\/JNEUROSCI.17-11-04302.1997","volume":"17","author":"N Kanwisher","year":"1997","unstructured":"Kanwisher, N., McDermott, J., Chun, M.M.: The fusiform face area: a module in human extrastriate cortex specialized for face perception. J. Neurosci. 17(11), 4302\u20134311 (1997)","journal-title":"J. Neurosci."},{"key":"14_CR19","doi-asserted-by":"crossref","unstructured":"Karras, T., Laine, S., Aittala, M., Hellsten, J., Lehtinen, J., Aila, T.: Analyzing and improving the image quality of StyleGAN. In: CVPR, pp. 8107\u20138116 (2020)","DOI":"10.1109\/CVPR42600.2020.00813"},{"issue":"6","key":"14_CR20","doi-asserted-by":"publisher","first-page":"84","DOI":"10.1145\/3065386","volume":"60","author":"A Krizhevsky","year":"2017","unstructured":"Krizhevsky, A., Sutskever, I., Hinton, G.E.: ImageNet classification with deep convolutional neural networks. Commun. ACM 60(6), 84\u201390 (2017)","journal-title":"Commun. ACM"},{"key":"14_CR21","doi-asserted-by":"publisher","first-page":"495","DOI":"10.1016\/S0079-6123(05)50034-7","volume":"150","author":"S Laureys","year":"2005","unstructured":"Laureys, S., et al.: The locked-in syndrome: what is it like to be conscious but paralyzed and voiceless? Prog. Brain Res. 150, 495\u2013611 (2005)","journal-title":"Prog. Brain Res."},{"key":"14_CR22","unstructured":"Lee, S., et\u00a0al.: NOIR: neural signal operated intelligent robots for everyday activities. In: CoRL (2023)"},{"key":"14_CR23","doi-asserted-by":"crossref","unstructured":"Li, Y., et al.: GLIGEN: open-set grounded text-to-image generation. In: CVPR (2023)","DOI":"10.1109\/CVPR52729.2023.02156"},{"key":"14_CR24","unstructured":"Lin, C.Y.: ROUGE: a package for automatic evaluation of summaries. In: Text Summarization Branches Out, pp. 74\u201381 (2004)"},{"key":"14_CR25","first-page":"29624","volume":"35","author":"S Lin","year":"2022","unstructured":"Lin, S., Sprague, T., Singh, A.K.: Mind Reader: reconstructing complex images from brain activities. NeurIPS 35, 29624\u201329636 (2022)","journal-title":"NeurIPS"},{"key":"14_CR26","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"740","DOI":"10.1007\/978-3-319-10602-1_48","volume-title":"Computer Vision \u2013 ECCV 2014","author":"T-Y Lin","year":"2014","unstructured":"Lin, T.-Y., et al.: Microsoft COCO: common objects in context. In: Fleet, D., Pajdla, T., Schiele, B., Tuytelaars, T. (eds.) ECCV 2014. LNCS, vol. 8693, pp. 740\u2013755. Springer, Cham (2014). https:\/\/doi.org\/10.1007\/978-3-319-10602-1_48"},{"key":"14_CR27","unstructured":"Liu, H., Li, C., Wu, Q., Lee, Y.J.: Visual instruction tuning. In: NeurIPS, vol. 36 (2023)"},{"key":"14_CR28","unstructured":"Loshchilov, I., Hutter, F.: Decoupled weight decay regularization. In: ICLR (2019)"},{"key":"14_CR29","unstructured":"Mai, W., Zhang, Z.: UniBrain: unify image reconstruction and captioning all in one diffusion model from human brain activity. arXiv preprint arXiv:2308.07428 (2023)"},{"key":"14_CR30","unstructured":"Oord, A.V.D., Li, Y., Vinyals, O.: Representation learning with contrastive predictive coding. arXiv preprint arXiv:1807.03748 (2018)"},{"issue":"1","key":"14_CR31","doi-asserted-by":"publisher","first-page":"15666","DOI":"10.1038\/s41598-023-42891-8","volume":"13","author":"F Ozcelik","year":"2023","unstructured":"Ozcelik, F., VanRullen, R.: Brain-Diffuser: natural scene reconstruction from fMRI signals using generative latent diffusion. Sci. Rep. 13(1), 15666 (2023)","journal-title":"Sci. Rep."},{"key":"14_CR32","doi-asserted-by":"crossref","unstructured":"Papineni, K., Roukos, S., Ward, T., Zhu, W.J.: BLEU: a method for automatic evaluation of machine translation. In: ACL, pp. 311\u2013318 (2002)","DOI":"10.3115\/1073083.1073135"},{"key":"14_CR33","unstructured":"Podell, D., et al.: SDXL: improving latent diffusion models for high-resolution image synthesis. In: ICLR (2024)"},{"issue":"16","key":"14_CR34","doi-asserted-by":"publisher","first-page":"5205","DOI":"10.1523\/JNEUROSCI.16-16-05205.1996","volume":"16","author":"A Puce","year":"1996","unstructured":"Puce, A., Allison, T., Asgari, M., Gore, J.C., McCarthy, G.: Differential sensitivity of human visual cortex to faces, letterstrings, and textures: a functional magnetic resonance imaging study. J. Neurosci. 16(16), 5205\u20135215 (1996)","journal-title":"J. Neurosci."},{"key":"14_CR35","unstructured":"Radford, A., et\u00a0al.: Learning transferable visual models from natural language supervision. In: ICML, pp. 8748\u20138763. PMLR (2021)"},{"key":"14_CR36","doi-asserted-by":"crossref","unstructured":"Rombach, R., Blattmann, A., Lorenz, D., Esser, P., Ommer, B.: High-resolution image synthesis with latent diffusion models. In: CVPR, pp. 10684\u201310695 (2022)","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"14_CR37","unstructured":"Scotti, P.S., et\u00a0al.: Reconstructing the mind\u2019s eye: fMRI-to-image with contrastive learning and diffusion priors. In: NeurIPS (2023)"},{"key":"14_CR38","doi-asserted-by":"crossref","unstructured":"Smith, L.N., Topin, N.: Super-convergence: very fast training of neural networks using large learning rates. In: Artificial Intelligence and Machine Learning for Multi-domain Operations Applications, vol. 11006, pp. 369\u2013386. SPIE (2019)","DOI":"10.1117\/12.2520589"},{"key":"14_CR39","doi-asserted-by":"crossref","unstructured":"Szegedy, C., Vanhoucke, V., Ioffe, S., Shlens, J., Wojna, Z.: Rethinking the inception architecture for computer vision. In: CVPR, pp. 2818\u20132826 (2016)","DOI":"10.1109\/CVPR.2016.308"},{"key":"14_CR40","doi-asserted-by":"crossref","unstructured":"Takagi, Y., Nishimoto, S.: High-resolution image reconstruction with latent diffusion models from human brain activity. In: CVPR, pp. 14453\u201314463 (2023)","DOI":"10.1109\/CVPR52729.2023.01389"},{"key":"14_CR41","unstructured":"Takagi, Y., Nishimoto, S.: Improving visual image reconstruction from human brain activity using latent diffusion models via multiple decoded inputs. arXiv preprint arXiv:2306.11536 (2023)"},{"key":"14_CR42","unstructured":"Tan, M., Le, Q.: EfficientNet: rethinking model scaling for convolutional neural networks. In: ICML, pp. 6105\u20136114. PMLR (2019)"},{"key":"14_CR43","doi-asserted-by":"publisher","first-page":"858","DOI":"10.1038\/s41593-023-01304-9","volume":"26","author":"J Tang","year":"2023","unstructured":"Tang, J., LeBel, A., Jain, S., Huth, A.G.: Semantic reconstruction of continuous language from non-invasive brain recordings. Nat. Neurosci. 26, 858\u2013866 (2023)","journal-title":"Nat. Neurosci."},{"key":"14_CR44","unstructured":"Touvron, H., et\u00a0al.: LlaMA: open and efficient foundation language models. arXiv preprint arXiv:2302.13971 (2023)"},{"issue":"1","key":"14_CR45","doi-asserted-by":"publisher","first-page":"55","DOI":"10.1038\/nrn3857","volume":"16","author":"LQ Uddin","year":"2015","unstructured":"Uddin, L.Q.: Salience processing and insular cortical function and dysfunction. Nat. Rev. Neurosci. 16(1), 55\u201361 (2015)","journal-title":"Nat. Rev. Neurosci."},{"key":"14_CR46","unstructured":"Vaswani, A., et al.: Attention is all you need. In: NeurIPS, vol. 30 (2017)"},{"key":"14_CR47","doi-asserted-by":"crossref","unstructured":"Vedantam, R., Lawrence\u00a0Zitnick, C., Parikh, D.: CIDEr: consensus-based image description evaluation. In: CVPR, pp. 4566\u20134575 (2015)","DOI":"10.1109\/CVPR.2015.7299087"},{"issue":"4","key":"14_CR48","first-page":"600","volume":"13","author":"Z Wang","year":"2004","unstructured":"Wang, Z., Bovik, A.C., Sheikh, H.R., Simoncelli, E.P.: Image quality assessment: from error visibility to structural similarity. TIP 13(4), 600\u2013612 (2004)","journal-title":"TIP"},{"key":"14_CR49","doi-asserted-by":"crossref","unstructured":"Xia, W., de\u00a0Charette, R., \u00d6ztireli, C., Xue, J.H.: DREAM: visual decoding from reversing human visual system. In: WACV, pp. 8226\u20138235 (2024)","DOI":"10.1109\/WACV57701.2024.00804"},{"key":"14_CR50","doi-asserted-by":"crossref","unstructured":"Xia, W., Yang, Y., Xue, J.H., Feng, W.: Controllable continuous gaze redirection. In: ACM MM, pp. 1782\u20131790 (2020)","DOI":"10.1145\/3394171.3413868"},{"key":"14_CR51","doi-asserted-by":"crossref","unstructured":"Xu, X., Wang, Z., Zhang, E., Wang, K., Shi, H.: Versatile diffusion: text, images and variations all in one diffusion model. In: ICCV, pp. 7754\u20137765 (2023)","DOI":"10.1109\/ICCV51070.2023.00713"},{"key":"14_CR52","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"69","DOI":"10.1007\/978-3-319-46475-6_5","volume-title":"Computer Vision \u2013 ECCV 2016","author":"L Yu","year":"2016","unstructured":"Yu, L., Poirson, P., Yang, S., Berg, A.C., Berg, T.L.: Modeling context in referring expressions. In: Leibe, B., Matas, J., Sebe, N., Welling, M. (eds.) ECCV 2016. LNCS, vol. 9906, pp. 69\u201385. Springer, Cham (2016). https:\/\/doi.org\/10.1007\/978-3-319-46475-6_5"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-72667-5_14","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,9,28]],"date-time":"2024-09-28T20:15:27Z","timestamp":1727554527000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-72667-5_14"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,9,29]]},"ISBN":["9783031726668","9783031726675"],"references-count":52,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-72667-5_14","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,9,29]]},"assertion":[{"value":"29 September 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}