{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,13]],"date-time":"2026-04-13T20:49:50Z","timestamp":1776113390868,"version":"3.50.1"},"publisher-location":"Cham","reference-count":102,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031726729","type":"print"},{"value":"9783031726736","type":"electronic"}],"license":[{"start":{"date-parts":[[2024,10,22]],"date-time":"2024-10-22T00:00:00Z","timestamp":1729555200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,10,22]],"date-time":"2024-10-22T00:00:00Z","timestamp":1729555200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-72673-6_8","type":"book-chapter","created":{"date-parts":[[2024,10,21]],"date-time":"2024-10-21T16:03:50Z","timestamp":1729526630000},"page":"135-155","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":9,"title":["LEGO: Learning EGOcentric Action Frame Generation via\u00a0Visual Instruction Tuning"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-7578-7336","authenticated-orcid":false,"given":"Bolin","family":"Lai","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Xiaoliang","family":"Dai","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Lawrence","family":"Chen","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Guan","family":"Pang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"James M.","family":"Rehg","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Miao","family":"Liu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2024,10,22]]},"reference":[{"key":"8_CR1","unstructured":"Alayrac, J.B., et al.: Flamingo: a visual language model for few-shot learning. In: Advances in Neural Information Processing Systems, vol. 35, pp. 23716\u201323736 (2022)"},{"key":"8_CR2","unstructured":"Anonymity: Making multimodal generation easier: when diffusion models meet LLMs. Openreview (2023)"},{"key":"8_CR3","doi-asserted-by":"crossref","unstructured":"Ashutosh, K., Girdhar, R., Torresani, L., Grauman, K.: Hiervl: learning hierarchical video-language embeddings. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 23066\u201323078 (2023)","DOI":"10.1109\/CVPR52729.2023.02209"},{"issue":"2\u20133","key":"8_CR4","doi-asserted-by":"publisher","first-page":"239","DOI":"10.1177\/009885880703300205","volume":"33","author":"JH Baskin","year":"2007","unstructured":"Baskin, J.H., Edersheim, J.G., Price, B.H.: Is a picture worth a thousand words? Neuroimaging in the courtroom. Am. J. Law Med. 33(2\u20133), 239\u2013269 (2007)","journal-title":"Am. J. Law Med."},{"key":"8_CR5","doi-asserted-by":"crossref","unstructured":"Brooks, T., Holynski, A., Efros, A.A.: Instructpix2pix: learning to follow image editing instructions. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 18392\u201318402 (2023)","DOI":"10.1109\/CVPR52729.2023.01764"},{"key":"8_CR6","unstructured":"Brown, T., et al.: Language models are few-shot learners. In: Advances in Neural Information Processing Systems, vol. 33, pp. 1877\u20131901 (2020)"},{"key":"8_CR7","doi-asserted-by":"crossref","unstructured":"Chakrabarty, T., Singh, K., Saakyan, A., Muresan, S.: Learning to follow object-centric image editing instructions faithfully. arXiv preprint arXiv:2310.19145 (2023)","DOI":"10.18653\/v1\/2023.findings-emnlp.646"},{"key":"8_CR8","unstructured":"Chen, J., et al.: Pixart-alpha: fast training of diffusion transformer for photorealistic text-to-image synthesis. arXiv preprint arXiv:2310.00426 (2023)"},{"key":"8_CR9","unstructured":"Chen, W.G., Spiridonova, I., Yang, J., Gao, J., Li, C.: Llava-interactive: an all-in-one demo for image chat, segmentation, generation and editing. arXiv preprint arXiv:2311.00571 (2023)"},{"key":"8_CR10","unstructured":"Chowdhery, A., et al.: Palm: scaling language modeling with pathways. arXiv preprint arXiv:2204.02311 (2022)"},{"key":"8_CR11","unstructured":"Couairon, G., Verbeek, J., Schwenk, H., Cord, M.: Diffedit: diffusion-based semantic image editing with mask guidance. arXiv preprint arXiv:2210.11427 (2022)"},{"key":"8_CR12","unstructured":"Dai, W., et al.: Instructblip: towards general-purpose vision-language models with instruction tuning. In: Advances in Neural Information Processing Systems (2023)"},{"key":"8_CR13","doi-asserted-by":"crossref","unstructured":"Damen, D., et al.: Rescaling egocentric vision: collection, pipeline and challenges for epic-kitchens-100. Int. J. Comput. Vision 1\u201323 (2022)","DOI":"10.1007\/s11263-021-01531-2"},{"key":"8_CR14","unstructured":"Du, Y., et al.: Learning universal policies via text-guided video generation. In: Thirty-Seventh Conference on Neural Information Processing Systems (2023)"},{"key":"8_CR15","unstructured":"Epstein, D., Jabri, A., Poole, B., Efros, A.A., Holynski, A.: Diffusion self-guidance for controllable image generation. In: Advances in Neural Information Processing Systems (2023)"},{"key":"8_CR16","doi-asserted-by":"crossref","unstructured":"Girdhar, R., Grauman, K.: Anticipative video transformer. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 13505\u201313515 (2021)","DOI":"10.1109\/ICCV48922.2021.01325"},{"key":"8_CR17","unstructured":"Glorot, X., Bengio, Y.: Understanding the difficulty of training deep feedforward neural networks. In: Proceedings of the Thirteenth International Conference on Artificial Intelligence and Statistics, pp. 249\u2013256. JMLR Workshop and Conference Proceedings (2010)"},{"key":"8_CR18","unstructured":"Goel, V., et al.: Pair-diffusion: object-level image editing with structure-and-appearance paired diffusion models. arXiv preprint arXiv:2303.17546 (2023)"},{"issue":"11","key":"8_CR19","doi-asserted-by":"publisher","first-page":"139","DOI":"10.1145\/3422622","volume":"63","author":"I Goodfellow","year":"2020","unstructured":"Goodfellow, I., et al.: Generative adversarial networks. Commun. ACM 63(11), 139\u2013144 (2020)","journal-title":"Commun. ACM"},{"key":"8_CR20","doi-asserted-by":"crossref","unstructured":"Goyal, M., Modi, S., Goyal, R., Gupta, S.: Human hands as probes for interactive object understanding. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 3293\u20133303 (2022)","DOI":"10.1109\/CVPR52688.2022.00329"},{"key":"8_CR21","unstructured":"Grauman, K., et al.: Ego4D: around the world in 3,000 hours of egocentric video. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 18995\u201319012 (2022)"},{"issue":"11","key":"8_CR22","doi-asserted-by":"publisher","first-page":"3056","DOI":"10.1523\/JNEUROSCI.2496-16.2017","volume":"37","author":"A Hafri","year":"2017","unstructured":"Hafri, A., Trueswell, J.C., Epstein, R.A.: Neural representations of observed actions generalize across static and dynamic visual input. J. Neurosci. 37(11), 3056\u20133071 (2017)","journal-title":"J. Neurosci."},{"key":"8_CR23","unstructured":"Han, J., et al.: Imagebind-LLM: multi-modality instruction tuning. arXiv preprint arXiv:2309.03905 (2023)"},{"key":"8_CR24","doi-asserted-by":"crossref","unstructured":"Han, L., et al.: Proxedit: improving tuning-free real image editing with proximal guidance. In: Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision, pp. 4291\u20134301 (2024)","DOI":"10.1109\/WACV57701.2024.00424"},{"key":"8_CR25","unstructured":"Hertz, A., Mokady, R., Tenenbaum, J., Aberman, K., Pritch, Y., Cohen-Or, D.: Prompt-to-prompt image editing with cross attention control. arXiv preprint arXiv:2208.01626 (2022)"},{"key":"8_CR26","unstructured":"Heusel, M., Ramsauer, H., Unterthiner, T., Nessler, B., Hochreiter, S.: GANs trained by a two time-scale update rule converge to a local nash equilibrium. In: Advances in Neural Information Processing Systems, vol. 30 (2017)"},{"key":"8_CR27","unstructured":"Ho, J., Jain, A., Abbeel, P.: Denoising diffusion probabilistic models. In: Advances in Neural Information Processing Systems, vol. 33, pp. 6840\u20136851 (2020)"},{"key":"8_CR28","doi-asserted-by":"crossref","unstructured":"Huang, J., Liu, Y., Qin, J., Chen, S.: KV inversion: KV embeddings learning for text-conditioned real image action editing. arXiv preprint arXiv:2309.16608 (2023)","DOI":"10.1007\/978-981-99-8429-9_14"},{"key":"8_CR29","doi-asserted-by":"publisher","first-page":"7795","DOI":"10.1109\/TIP.2020.3007841","volume":"29","author":"Y Huang","year":"2020","unstructured":"Huang, Y., Cai, M., Li, Z., Lu, F., Sato, Y.: Mutual context network for jointly estimating egocentric gaze and action. IEEE Trans. Image Process. 29, 7795\u20137806 (2020)","journal-title":"IEEE Trans. Image Process."},{"key":"8_CR30","doi-asserted-by":"crossref","unstructured":"Huang, Y., Cai, M., Li, Z., Sato, Y.: Predicting gaze in egocentric video by learning task-dependent attention transition. In: Proceedings of the European Conference on Computer Vision (ECCV), pp. 754\u2013769 (2018)","DOI":"10.1007\/978-3-030-01225-0_46"},{"key":"8_CR31","unstructured":"Iandola, F.N., Han, S., Moskewicz, M.W., Ashraf, K., Dally, W.J., Keutzer, K.: Squeezenet: alexnet-level accuracy with 50x fewer parameters and< 0.5 mb model size. arXiv preprint arXiv:1602.07360 (2016)"},{"key":"8_CR32","series-title":"LNCS","doi-asserted-by":"publisher","first-page":"639","DOI":"10.1007\/978-3-031-19778-9_37","volume-title":"ECCV 2022","author":"W Jia","year":"2022","unstructured":"Jia, W., Liu, M., Rehg, J.M.: Generative adversarial network for future hand segmentation from egocentric video. In: Avidan, S., Brostow, G., Ciss\u00e9, M., Farinella, G.M., Hassner, T. (eds.) ECCV 2022. LNCS, vol. 13673, pp. 639\u2013656. Springer, Cham (2022). https:\/\/doi.org\/10.1007\/978-3-031-19778-9_37"},{"key":"8_CR33","doi-asserted-by":"crossref","unstructured":"Jiang, Y., Zhang, Z., Xue, T., Gu, J.: Autodir: automatic all-in-one image restoration with latent diffusion. arXiv preprint arXiv:2310.10123 (2023)","DOI":"10.1007\/978-3-031-73661-2_19"},{"key":"8_CR34","doi-asserted-by":"crossref","unstructured":"Joseph, K., et al.: Iterative multi-granular image editing using diffusion models. In: Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision (2024)","DOI":"10.1109\/WACV57701.2024.00792"},{"key":"8_CR35","unstructured":"Kawar, B., Elad, M., Ermon, S., Song, J.: Denoising diffusion restoration models. In: Advances in Neural Information Processing Systems (2022)"},{"key":"8_CR36","doi-asserted-by":"crossref","unstructured":"Kawar, B., et al.: Imagic: text-based real image editing with diffusion models. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 6007\u20136017 (2023)","DOI":"10.1109\/CVPR52729.2023.00582"},{"key":"8_CR37","doi-asserted-by":"crossref","unstructured":"Kazakos, E., Nagrani, A., Zisserman, A., Damen, D.: Epic-fusion: audio-visual temporal binding for egocentric action recognition. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 5492\u20135501 (2019)","DOI":"10.1109\/ICCV.2019.00559"},{"key":"8_CR38","unstructured":"Kim, S., et al.: User-friendly image editing with minimal text input: leveraging captioning and injection techniques. arXiv preprint arXiv:2306.02717 (2023)"},{"key":"8_CR39","unstructured":"Kirillov, A., et al.: Segment anything. arXiv preprint arXiv:2304.02643 (2023)"},{"key":"8_CR40","unstructured":"Koh, J.Y., Fried, D., Salakhutdinov, R.R.: Generating images with multimodal language models. In: Advances in Neural Information Processing Systems, vol. 36 (2024)"},{"key":"8_CR41","doi-asserted-by":"crossref","unstructured":"Lai, B., Liu, M., Ryan, F., Rehg, J.M.: In the eye of transformer: global-local correlation for egocentric gaze estimation. In: British Machine Vision Conference (2022)","DOI":"10.1007\/s11263-023-01879-7"},{"issue":"3","key":"8_CR42","doi-asserted-by":"publisher","first-page":"854","DOI":"10.1007\/s11263-023-01879-7","volume":"132","author":"B Lai","year":"2024","unstructured":"Lai, B., Liu, M., Ryan, F., Rehg, J.M.: In the eye of transformer: global-local correlation for egocentric gaze estimation and beyond. Int. J. Comput. Vision 132(3), 854\u2013871 (2024)","journal-title":"Int. J. Comput. Vision"},{"key":"8_CR43","doi-asserted-by":"crossref","unstructured":"Lai, B., Ryan, F., Jia, W., Liu, M., Rehg, J.M.: Listen to look into the future: audio-visual egocentric gaze anticipation. arXiv preprint arXiv:2305.03907 (2023)","DOI":"10.1007\/978-3-031-72673-6_11"},{"key":"8_CR44","unstructured":"Li, D., Li, J., Hoi, S.C.: Blip-diffusion: pre-trained subject representation for controllable text-to-image generation and editing. In: Advances in Neural Information Processing Systems (2023)"},{"key":"8_CR45","doi-asserted-by":"crossref","unstructured":"Li, J., Liu, K., Wu, J.: Ego-body pose estimation via ego-head pose estimation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 17142\u201317151 (2023)","DOI":"10.1109\/CVPR52729.2023.01644"},{"key":"8_CR46","unstructured":"Li, J., Li, D., Savarese, S., Hoi, S.: Blip-2: bootstrapping language-image pre-training with frozen image encoders and large language models. In: International Conference on Machine Learning (2023)"},{"key":"8_CR47","unstructured":"Li, J., Li, D., Xiong, C., Hoi, S.: Blip: bootstrapping language-image pre-training for unified vision-language understanding and generation. In: International Conference on Machine Learning, pp. 12888\u201312900. PMLR (2022)"},{"key":"8_CR48","doi-asserted-by":"crossref","unstructured":"Li, Y., Liu, M., Rehg, J.M.: In the eye of beholder: joint learning of gaze and actions in first person video. In: Proceedings of the European Conference on Computer Vision (ECCV), pp. 619\u2013635 (2018)","DOI":"10.1007\/978-3-030-01228-1_38"},{"key":"8_CR49","unstructured":"Lian, L., Li, B., Yala, A., Darrell, T.: LLM-grounded diffusion: enhancing prompt understanding of text-to-image diffusion models with large language models. arXiv preprint arXiv:2305.13655 (2023)"},{"key":"8_CR50","unstructured":"Lin, K.Q., et al.: Egocentric video-language pretraining. In: Advances in Neural Information Processing Systems, vol. 35, pp. 7575\u20137586 (2022)"},{"key":"8_CR51","doi-asserted-by":"crossref","unstructured":"Liu, B., Zhang, H., Liu, J., Wang, Q.: Acigs: an automated large-scale crops image generation system based on large visual language multi-modal models. In: 2023 20th Annual IEEE International Conference on Sensing, Communication, and Networking (SECON), pp. 7\u201313. IEEE (2023)","DOI":"10.1109\/SECON58729.2023.10287530"},{"key":"8_CR52","unstructured":"Liu, H., Li, C., Wu, Q., Lee, Y.J.: Visual instruction tuning. In: Advances in Neural Information Processing Systems (2023)"},{"key":"8_CR53","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"704","DOI":"10.1007\/978-3-030-58452-8_41","volume-title":"Computer Vision \u2013 ECCV 2020","author":"M Liu","year":"2020","unstructured":"Liu, M., Tang, S., Li, Y., Rehg, J.M.: Forecasting human-object interaction: joint prediction of motor attention and\u00a0actions in first person video. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12346, pp. 704\u2013721. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58452-8_41"},{"key":"8_CR54","doi-asserted-by":"crossref","unstructured":"Liu, S., Tripathi, S., Majumdar, S., Wang, X.: Joint hand motion and interaction hotspots prediction from egocentric videos. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 3282\u20133292 (2022)","DOI":"10.1109\/CVPR52688.2022.00328"},{"key":"8_CR55","unstructured":"Luo, Z., Hachiuma, R., Yuan, Y., Kitani, K.: Dynamics-regulated kinematic policy for egocentric pose estimation. In: Advances in Neural Information Processing Systems, vol. 34, pp. 25019\u201325032 (2021)"},{"key":"8_CR56","unstructured":"Meng, C., et al.: SDEdit: guided image synthesis and editing with stochastic differential equations. In: International Conference on Learning Representations (2022)"},{"key":"8_CR57","doi-asserted-by":"crossref","unstructured":"Mirzaei, A., et al.: Watch your steps: local image and scene editing by text instructions. arXiv preprint arXiv:2308.08947 (2023)","DOI":"10.1007\/978-3-031-72920-1_7"},{"key":"8_CR58","doi-asserted-by":"crossref","unstructured":"Mokady, R., Hertz, A., Aberman, K., Pritch, Y., Cohen-Or, D.: Null-text inversion for editing real images using guided diffusion models. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 6038\u20136047 (2023)","DOI":"10.1109\/CVPR52729.2023.00585"},{"key":"8_CR59","unstructured":"Molad, E., et al.: Dreamix: video diffusion models are general video editors. arXiv preprint arXiv:2302.01329 (2023)"},{"key":"8_CR60","doi-asserted-by":"crossref","unstructured":"Orgad, H., Kawar, B., Belinkov, Y.: Editing implicit assumptions in text-to-image diffusion models. arXiv preprint arXiv:2303.08084 (2023)","DOI":"10.1109\/ICCV51070.2023.00649"},{"key":"8_CR61","doi-asserted-by":"crossref","unstructured":"Pan, Z., Gherardi, R., Xie, X., Huang, S.: Effective real image editing with accelerated iterative diffusion inversion. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 15912\u201315921 (2023)","DOI":"10.1109\/ICCV51070.2023.01458"},{"key":"8_CR62","doi-asserted-by":"crossref","unstructured":"Pramanick, S., et al.: Egovlpv2: egocentric video-language pre-training with fusion in the backbone. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 5285\u20135297 (2023)","DOI":"10.1109\/ICCV51070.2023.00487"},{"key":"8_CR63","unstructured":"Radford, A., et al.: Learning transferable visual models from natural language supervision. In: International Conference on Machine Learning, pp. 8748\u20138763. PMLR (2021)"},{"issue":"1","key":"8_CR64","first-page":"5485","volume":"21","author":"C Raffel","year":"2020","unstructured":"Raffel, C., et al.: Exploring the limits of transfer learning with a unified text-to-text transformer. J. Mach. Learn. Res. 21(1), 5485\u20135551 (2020)","journal-title":"J. Mach. Learn. Res."},{"key":"8_CR65","doi-asserted-by":"crossref","unstructured":"Ragusa, F., Farinella, G.M., Furnari, A.: Stillfast: an end-to-end approach for short-term object interaction anticipation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 3635\u20133644 (2023)","DOI":"10.1109\/CVPRW59228.2023.00371"},{"key":"8_CR66","doi-asserted-by":"crossref","unstructured":"Ramakrishnan, S.K., Al-Halah, Z., Grauman, K.: NAQ: leveraging narrations as queries to supervise episodic memory. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 6694\u20136703 (2023)","DOI":"10.1109\/CVPR52729.2023.00647"},{"key":"8_CR67","doi-asserted-by":"crossref","unstructured":"Rombach, R., Blattmann, A., Lorenz, D., Esser, P., Ommer, B.: High-resolution image synthesis with latent diffusion models. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 10684\u201310695 (2022)","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"8_CR68","doi-asserted-by":"crossref","unstructured":"Ruiz, N., Li, Y., Jampani, V., Pritch, Y., Rubinstein, M., Aberman, K.: Dreambooth: fine tuning text-to-image diffusion models for subject-driven generation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 22500\u201322510 (2023)","DOI":"10.1109\/CVPR52729.2023.02155"},{"key":"8_CR69","doi-asserted-by":"crossref","unstructured":"Ryan, F., Jiang, H., Shukla, A., Rehg, J.M., Ithapu, V.K.: Egocentric auditory attention localization in conversations. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 14663\u201314674 (2023)","DOI":"10.1109\/CVPR52729.2023.01409"},{"key":"8_CR70","unstructured":"Shen, Y., Song, K., Tan, X., Li, D., Lu, W., Zhuang, Y.: Hugginggpt: solving AI tasks with chatgpt and its friends in huggingface. arXiv preprint arXiv:2303.17580 (2023)"},{"key":"8_CR71","doi-asserted-by":"crossref","unstructured":"Shi, J., Xiong, W., Lin, Z., Jung, H.J.: Instantbooth: personalized text-to-image generation without test-time finetuning. arXiv preprint arXiv:2304.03411 (2023)","DOI":"10.1109\/CVPR52733.2024.00816"},{"key":"8_CR72","unstructured":"Stein, G., et al.: Exposing flaws of generative model evaluation metrics and their unfair treatment of diffusion models. arXiv preprint arXiv:2306.04675 (2023)"},{"key":"8_CR73","unstructured":"Su, Y., Lan, T., Li, H., Xu, J., Wang, Y., Cai, D.: Pandagpt: one model to instruction-follow them all. arXiv preprint arXiv:2305.16355 (2023)"},{"key":"8_CR74","doi-asserted-by":"crossref","unstructured":"Sudhakaran, S., Escalera, S., Lanz, O.: LSTA: long short-term attention for egocentric action recognition. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 9954\u20139963 (2019)","DOI":"10.1109\/CVPR.2019.01019"},{"key":"8_CR75","doi-asserted-by":"crossref","unstructured":"Sun, Z., Zhou, Y., He, H., Mok, P.: Sgdiff: a style guided diffusion model for fashion synthesis. In: Proceedings of the 31st ACM International Conference on Multimedia, pp. 8433\u20138442 (2023)","DOI":"10.1145\/3581783.3613806"},{"key":"8_CR76","unstructured":"Thoppilan, R., et\u00a0al.: Lamda: language models for dialog applications. arXiv preprint arXiv:2201.08239 (2022)"},{"issue":"6","key":"8_CR77","doi-asserted-by":"publisher","first-page":"6794","DOI":"10.1109\/TPAMI.2020.3029700","volume":"45","author":"D Tome","year":"2020","unstructured":"Tome, D., et al.: Selfpose: 3D egocentric pose estimation from a headset mounted camera. IEEE Trans. Pattern Anal. Mach. Intell. 45(6), 6794\u20136806 (2020)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"8_CR78","unstructured":"Touvron, H., et al.: Llama: open and efficient foundation language models. arXiv preprint arXiv:2302.13971 (2023)"},{"key":"8_CR79","unstructured":"Tsaban, L., Passos, A.: Ledits: real image editing with DDPM inversion and semantic guidance. arXiv preprint arXiv:2307.00522 (2023)"},{"key":"8_CR80","doi-asserted-by":"crossref","unstructured":"Wallace, B., Gokul, A., Naik, N.: Edict: exact diffusion inversion via coupled transformations. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 22532\u201322541 (2023)","DOI":"10.1109\/CVPR52729.2023.02158"},{"key":"8_CR81","doi-asserted-by":"crossref","unstructured":"Wang, J., Luvizon, D., Xu, W., Liu, L., Sarkar, K., Theobalt, C.: Scene-aware egocentric 3D human pose estimation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 13031\u201313040 (2023)","DOI":"10.1109\/CVPR52729.2023.01252"},{"key":"8_CR82","unstructured":"Wang, K., Yang, F., Yang, S., Butt, M.A., van\u00a0de Weijer, J.: Dynamic prompt learning: addressing cross-attention leakage for text-based image editing. arXiv preprint arXiv:2309.15664 (2023)"},{"key":"8_CR83","unstructured":"Wang, Q., Zhang, B., Birsak, M., Wonka, P.: Instructedit: improving automatic masks for diffusion-based image editing with user instructions. arXiv preprint arXiv:2305.18047 (2023)"},{"key":"8_CR84","unstructured":"Wang, Q., Zhang, B., Birsak, M., Wonka, P.: MDP: a generalized framework for text-guided image editing by manipulating the diffusion path. arXiv preprint arXiv:2303.16765 (2023)"},{"key":"8_CR85","unstructured":"Wang, W., et al.: Zero-shot video editing using off-the-shelf image diffusion models. arXiv preprint arXiv:2303.17599 (2023)"},{"key":"8_CR86","doi-asserted-by":"crossref","unstructured":"Wang, X., Zhu, L., Wang, H., Yang, Y.: Interactive prototype learning for egocentric action recognition. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 8168\u20138177 (2021)","DOI":"10.1109\/ICCV48922.2021.00806"},{"key":"8_CR87","doi-asserted-by":"crossref","unstructured":"Wei, Y., Zhang, Y., Ji, Z., Bai, J., Zhang, L., Zuo, W.: Elite: encoding visual concepts into textual embeddings for customized text-to-image generation. arXiv preprint arXiv:2302.13848 (2023)","DOI":"10.1109\/ICCV51070.2023.01461"},{"key":"8_CR88","unstructured":"Wen, S., Fang, G., Zhang, R., Gao, P., Dong, H., Metaxas, D.: Improving compositional text-to-image generation with large vision-language models. arXiv preprint arXiv:2310.06311 (2023)"},{"key":"8_CR89","unstructured":"Wu, C., Yin, S., Qi, W., Wang, X., Tang, Z., Duan, N.: Visual chatgpt: talking, drawing and editing with visual foundation models. arXiv preprint arXiv:2303.04671 (2023)"},{"key":"8_CR90","unstructured":"Wu, S., Fei, H., Qu, L., Ji, W., Chua, T.S.: Next-GPT: any-to-any multimodal LLM. arXiv preprint arXiv:2309.05519 (2023)"},{"key":"8_CR91","doi-asserted-by":"crossref","unstructured":"Xu, Y., et al.: Egopca: a new framework for egocentric hand-object interaction understanding. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 5273\u20135284 (2023)","DOI":"10.1109\/ICCV51070.2023.00486"},{"key":"8_CR92","doi-asserted-by":"crossref","unstructured":"Ye, Y., et al.: Affordance diffusion: synthesizing hand-object interactions. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 22479\u201322489 (2023)","DOI":"10.1109\/CVPR52729.2023.02153"},{"key":"8_CR93","unstructured":"Yu, Q., Li, J., Ye, W., Tang, S., Zhuang, Y.: Interactive data synthesis for systematic vision adaptation via LLMs-AIGCs collaboration. arXiv preprint arXiv:2305.12799 (2023)"},{"key":"8_CR94","doi-asserted-by":"crossref","unstructured":"Yu, Z., Li, H., Fu, F., Miao, X., Cui, B.: Fisedit: accelerating text-to-image editing via cache-enabled sparse diffusion inference. arXiv preprint arXiv:2305.17423 (2023)","DOI":"10.1609\/aaai.v38i15.29599"},{"key":"8_CR95","doi-asserted-by":"crossref","unstructured":"Zhang, H., Li, X., Bing, L.: Video-llama: an instruction-tuned audio-visual language model for video understanding. arXiv preprint arXiv:2306.02858 (2023)","DOI":"10.18653\/v1\/2023.emnlp-demo.49"},{"key":"8_CR96","doi-asserted-by":"crossref","unstructured":"Zhang, M., Teck\u00a0Ma, K., Hwee\u00a0Lim, J., Zhao, Q., Feng, J.: Deep future gaze: gaze anticipation on egocentric videos using adversarial networks. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 4372\u20134381 (2017)","DOI":"10.1109\/CVPR.2017.377"},{"key":"8_CR97","doi-asserted-by":"crossref","unstructured":"Zhang, R., Isola, P., Efros, A.A., Shechtman, E., Wang, O.: The unreasonable effectiveness of deep features as a perceptual metric. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 586\u2013595 (2018)","DOI":"10.1109\/CVPR.2018.00068"},{"key":"8_CR98","unstructured":"Zhang, S., et al.: Hive: harnessing human feedback for instructional visual editing. arXiv preprint arXiv:2303.09618 (2023)"},{"key":"8_CR99","unstructured":"Zhang, S., et al.: OPT: open pre-trained transformer language models. arXiv preprint arXiv:2205.01068 (2022)"},{"key":"8_CR100","doi-asserted-by":"crossref","unstructured":"Zhang, Y., et al.: Inversion-based style transfer with diffusion models. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 10146\u201310156 (2023)","DOI":"10.1109\/CVPR52729.2023.00978"},{"key":"8_CR101","doi-asserted-by":"crossref","unstructured":"Zhang, Z., Han, L., Ghosh, A., Metaxas, D.N., Ren, J.: Sine: single image editing with text-to-image diffusion models. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 6027\u20136037 (2023)","DOI":"10.1109\/CVPR52729.2023.00584"},{"key":"8_CR102","unstructured":"Zhu, D., Chen, J., Shen, X., Li, X., Elhoseiny, M.: Minigpt-4: enhancing vision-language understanding with advanced large language models. arXiv preprint arXiv:2304.10592 (2023)"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-72673-6_8","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,11,30]],"date-time":"2024-11-30T00:04:39Z","timestamp":1732925079000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-72673-6_8"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,22]]},"ISBN":["9783031726729","9783031726736"],"references-count":102,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-72673-6_8","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,10,22]]},"assertion":[{"value":"22 October 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}