{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,1]],"date-time":"2026-05-01T17:05:51Z","timestamp":1777655151698,"version":"3.51.4"},"publisher-location":"Cham","reference-count":52,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031928079","type":"print"},{"value":"9783031928086","type":"electronic"}],"license":[{"start":{"date-parts":[[2025,1,1]],"date-time":"2025-01-01T00:00:00Z","timestamp":1735689600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,1,1]],"date-time":"2025-01-01T00:00:00Z","timestamp":1735689600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-92808-6_16","type":"book-chapter","created":{"date-parts":[[2025,5,27]],"date-time":"2025-05-27T15:59:41Z","timestamp":1748361581000},"page":"249-265","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":5,"title":["xGen-VideoSyn-1: High-Fidelity Text-to-Video Synthesis with\u00a0Compressed Representations"],"prefix":"10.1007","author":[{"given":"Can","family":"Qin","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Congying","family":"Xia","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Krithika","family":"Ramakrishnan","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Michael","family":"Ryoo","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Lifu","family":"Tu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yihao","family":"Feng","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Manli","family":"Shu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Honglu","family":"Zhou","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Anas","family":"Awadalla","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Jun","family":"Wang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Senthil","family":"Purushwalkam","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Le","family":"Xue","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yingbo","family":"Zhou","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Huan","family":"Wang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Silvio","family":"Savarese","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Juan Carlos","family":"Niebles","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Zeyuan","family":"Chen","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Ran","family":"Xu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Caiming","family":"Xiong","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2025,5,12]]},"reference":[{"key":"16_CR1","unstructured":"Abdin, M., et al.: Phi-3 technical report: a highly capable language model locally on your phone. arXiv preprint arXiv:2404.14219 (2024)"},{"key":"16_CR2","unstructured":"Alayrac, J.B., et al.: Flamingo: a visual language model for few-shot learning. In: Advances in Neural Information Processing Systems (2022)"},{"key":"16_CR3","doi-asserted-by":"crossref","unstructured":"Anne\u00a0Hendricks, L., Wang, O., Shechtman, E., Sivic, J., Darrell, T., Russell, B.: Localizing moments in video with natural language. In: IEEE International Conference on Computer Vision (2017)","DOI":"10.1109\/ICCV.2017.618"},{"key":"16_CR4","doi-asserted-by":"crossref","unstructured":"Bain, M., Nagrani, A., Varol, G., Zisserman, A.: Frozen in time: a joint video and image encoder for end-to-end retrieval. In: IEEE International Conference on Computer Vision (2021)","DOI":"10.1109\/ICCV48922.2021.00175"},{"key":"16_CR5","unstructured":"Blattmann, A., et\u00a0al.: Stable video diffusion: scaling latent video diffusion models to large datasets. arXiv preprint arXiv:2311.15127 (2023)"},{"key":"16_CR6","doi-asserted-by":"crossref","unstructured":"Blattmann, A., et al.: Align your latents: high-resolution video synthesis with latent diffusion models. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 22563\u201322575 (2023)","DOI":"10.1109\/CVPR52729.2023.02161"},{"key":"16_CR7","unstructured":"Brooks, T., et al.: Video generation models as world simulators (2024). https:\/\/openai.com\/research\/video-generation-models-as-world-simulators"},{"key":"16_CR8","doi-asserted-by":"crossref","unstructured":"Buhrmester, M., Kwang, T., Gosling, S.D.: Amazon\u2019s mechanical Turk: a new source of inexpensive, yet high-quality, data? Perspectives on Psychological Science (2011)","DOI":"10.1037\/e527772014-223"},{"key":"16_CR9","doi-asserted-by":"crossref","unstructured":"Caba\u00a0Heilbron, F., Escorcia, V., Ghanem, B., Carlos\u00a0Niebles, J.: ActivityNet: a large-scale video benchmark for human activity understanding. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 961\u2013970 (2015)","DOI":"10.1109\/CVPR.2015.7298698"},{"key":"16_CR10","unstructured":"Chen, D., Dolan, W.B.: Collecting highly parallel data for paraphrase evaluation. In: Proceedings of the 49th Annual Meeting of the Association for Computational Linguistics: Human Language Technologies, pp. 190\u2013200 (2011)"},{"key":"16_CR11","unstructured":"Chen, J., et\u00a0al.: PixArt-alpha: fast training of diffusion transformer for photorealistic text-to-image synthesis. arXiv preprint arXiv:2310.00426 (2023)"},{"key":"16_CR12","doi-asserted-by":"crossref","unstructured":"Chen, T.S., et al.: Panda-70m: captioning 70m videos with multiple cross-modality teachers. arXiv preprint arXiv:2402.19479 (2024)","DOI":"10.1109\/CVPR52733.2024.01265"},{"issue":"3","key":"16_CR13","doi-asserted-by":"publisher","first-page":"279","DOI":"10.1117\/1.482677","volume":"8","author":"ML Comer","year":"1999","unstructured":"Comer, M.L., Delp, E.J., III.: Morphological operations for color image processing. J. Electron. Imag. 8(3), 279\u2013289 (1999)","journal-title":"J. Electron. Imag."},{"key":"16_CR14","unstructured":"Culjak, I., Abram, D., Pribanic, T., Dzapo, H., Cifrek, M.: A brief introduction to OpenCV. In: 2012 Proceedings of the 35th International Convention MIPRO, pp. 1725\u20131730. IEEE (2012)"},{"key":"16_CR15","unstructured":"Dosovitskiy, A., et al.: An image is worth 16x16 words: transformers for image recognition at scale. In: International Conference on Learning Representations (2021)"},{"key":"16_CR16","doi-asserted-by":"crossref","unstructured":"Esser, P., Rombach, R., Ommer, B.: Taming transformers for high-resolution image synthesis. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 12873\u201312883 (2021)","DOI":"10.1109\/CVPR46437.2021.01268"},{"key":"16_CR17","unstructured":"Ho, J., et\u00a0al.: Imagen video: high definition video generation with diffusion models. arXiv preprint arXiv:2210.02303 (2022)"},{"key":"16_CR18","unstructured":"Ho, J., Jain, A., Abbeel, P.: Denoising diffusion probabilistic models. Adv. Neural Inf. Process. Syst. (2020)"},{"key":"16_CR19","unstructured":"Ho, J., Salimans, T.: Classifier-free diffusion guidance. arXiv preprint arXiv:2207.12598 (2022)"},{"key":"16_CR20","first-page":"8633","volume":"35","author":"J Ho","year":"2022","unstructured":"Ho, J., Salimans, T., Gritsenko, A., Chan, W., Norouzi, M., Fleet, D.J.: Video diffusion models. Adv. Neural. Inf. Process. Syst. 35, 8633\u20138646 (2022)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"16_CR21","doi-asserted-by":"publisher","unstructured":"Hor\u00e9, A., Ziou, D.: Image quality metrics: PSNR vs. SSIM. In: 2010 20th International Conference on Pattern Recognition, pp. 2366\u20132369 (2010). https:\/\/doi.org\/10.1109\/ICPR.2010.579","DOI":"10.1109\/ICPR.2010.579"},{"key":"16_CR22","doi-asserted-by":"crossref","unstructured":"Huang, Z., et\u00a0al.: Vbench: comprehensive benchmark suite for video generative models. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 21807\u201321818 (2024)","DOI":"10.1109\/CVPR52733.2024.02060"},{"key":"16_CR23","unstructured":"Jiang, A.Q., et\u00a0al.: Mixtral of experts. arXiv preprint arXiv:2401.04088 (2024)"},{"key":"16_CR24","unstructured":"Kay, W., et\u00a0al.: The kinetics human action video dataset. arXiv preprint arXiv:1705.06950 (2017)"},{"key":"16_CR25","unstructured":"Kingma, D.P., Welling, M.: Auto-encoding variational bayes. arXiv preprint arXiv:1312.6114 (2013)"},{"key":"16_CR26","doi-asserted-by":"publisher","unstructured":"Lin, B.: Open-sora-plan (2024). https:\/\/doi.org\/10.5281\/zenodo.10948109","DOI":"10.5281\/zenodo.10948109"},{"key":"16_CR27","unstructured":"Ma, X., et al.: Latte: latent diffusion transformer for video generation. arXiv preprint arXiv:2401.03048 (2024)"},{"key":"16_CR28","unstructured":"Nan, K., et al.: OpenVid-1M: a large-scale high-quality dataset for text-to-video generation. arXiv preprint arXiv:2407.02371 (2024)"},{"key":"16_CR29","doi-asserted-by":"crossref","unstructured":"Peebles, W., Xie, S.: Scalable diffusion models with transformers. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 4195\u20134205 (2023)","DOI":"10.1109\/ICCV51070.2023.00387"},{"key":"16_CR30","unstructured":"Pernias, P., Rampas, D., Richter, M.L., Pal, C.J., Aubreville, M.: Wuerstchen: an efficient architecture for large-scale text-to-image diffusion models (2023)"},{"key":"16_CR31","doi-asserted-by":"crossref","unstructured":"Piergiovanni, A., Noble, I., Kim, D., Ryoo, M.S., Gomes, V., Angelova, A.: Mirasol3B: a multimodal autoregressive model for time-aligned and contextual modalities. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (2024)","DOI":"10.1109\/CVPR52733.2024.02531"},{"issue":"140","key":"16_CR32","first-page":"1","volume":"21","author":"C Raffel","year":"2020","unstructured":"Raffel, C., et al.: Exploring the limits of transfer learning with a unified text-to-text transformer. J. Mach. Learn. Res. 21(140), 1\u201367 (2020)","journal-title":"J. Mach. Learn. Res."},{"key":"16_CR33","doi-asserted-by":"crossref","unstructured":"Rohrbach, A., Rohrbach, M., Tandon, N., Schiele, B.: A dataset for movie description. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (2015)","DOI":"10.1109\/CVPR.2015.7298940"},{"key":"16_CR34","doi-asserted-by":"crossref","unstructured":"Rombach, R., Blattmann, A., Lorenz, D., Esser, P., Ommer, B.: High-resolution image synthesis with latent diffusion models. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (2022)","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"16_CR35","doi-asserted-by":"crossref","unstructured":"Ronneberger, O., Fischer, P., Brox, T.: U-net: convolutional networks for biomedical image segmentation. In: Medical Image Computing and Computer-Assisted Intervention\u2013MICCAI 2015: 18th International Conference, Munich, Germany, October 5-9, 2015, Proceedings, part III 18, pp. 234\u2013241. Springer (2015)","DOI":"10.1007\/978-3-319-24574-4_28"},{"key":"16_CR36","doi-asserted-by":"crossref","unstructured":"Ryoo, M.S., et al.: Token turing machines. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (2023)","DOI":"10.1109\/CVPR52729.2023.01828"},{"key":"16_CR37","doi-asserted-by":"crossref","unstructured":"Sauer, A., Boesel, F., Dockhorn, T., Blattmann, A., Esser, P., Rombach, R.: Fast high-resolution image synthesis with latent adversarial diffusion distillation. arXiv preprint arXiv:2403.12015 (2024)","DOI":"10.1145\/3680528.3687625"},{"key":"16_CR38","unstructured":"Singer, U., et\u00a0al.: Make-a-video: text-to-video generation without text-video data. arXiv preprint arXiv:2209.14792 (2022)"},{"key":"16_CR39","doi-asserted-by":"publisher","first-page":"127063","DOI":"10.1016\/j.neucom.2023.127063","volume":"568","author":"J Su","year":"2024","unstructured":"Su, J., Ahmed, M., Lu, Y., Pan, S., Bo, W., Liu, Y.: Roformer: enhanced transformer with rotary position embedding. Neurocomputing 568, 127063 (2024)","journal-title":"Neurocomputing"},{"key":"16_CR40","unstructured":"Van Den\u00a0Oord, A., Vinyals, O., et\u00a0al.: Neural discrete representation learning. Adv. Neural Inf. Process. Syst. 30 (2017)"},{"key":"16_CR41","unstructured":"Vaswani, A., et al.: Attention is all you need. Adv. Neural Inf. Process. Syst. 30 (2017)"},{"key":"16_CR42","unstructured":"Wang, J., Yuan, H., Chen, D., Zhang, Y., Wang, X., Zhang, S.: Modelscope text-to-video technical report. arXiv preprint arXiv:2308.06571 (2023)"},{"key":"16_CR43","doi-asserted-by":"crossref","unstructured":"Wang, X., Wu, J., Chen, J., Li, L., Wang, Y.F., Wang, W.Y.: Vatex: a large-scale, high-quality multilingual dataset for video-and-language research. In: IEEE International Conference on Computer Vision (2019)","DOI":"10.1109\/ICCV.2019.00468"},{"key":"16_CR44","unstructured":"Wang, Y., et\u00a0al.: Lavie: high-quality video generation with cascaded latent diffusion models. arXiv preprint arXiv:2309.15103 (2023)"},{"key":"16_CR45","doi-asserted-by":"crossref","unstructured":"Xu, J., Mei, T., Yao, T., Rui, Y.: MSR-VTT: a large video description dataset for bridging video and language. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (2016)","DOI":"10.1109\/CVPR.2016.571"},{"key":"16_CR46","unstructured":"Xue, L., et\u00a0al.: xGen-MM (BLIP-3): a family of open large multimodal models. arXiv preprint arXiv:2408.08872 (2024)"},{"key":"16_CR47","unstructured":"Yan, W., Zhang, Y., Abbeel, P., Srinivas, A.: VideoGPT: video generation using VQ-VAE and transformers. arXiv preprint arXiv:2104.10157 (2021)"},{"key":"16_CR48","doi-asserted-by":"crossref","unstructured":"Yu, L., et\u00a0al.: MAGVIT: masked generative video transformer. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 10459\u201310469 (2023)","DOI":"10.1109\/CVPR52729.2023.01008"},{"key":"16_CR49","unstructured":"Yu, L., et\u00a0al.: Language model beats diffusion\u2013tokenizer is key to visual generation. arXiv preprint arXiv:2310.05737 (2023)"},{"key":"16_CR50","unstructured":"Zhang, R., et al.: Direct preference optimization of video large multimodal models from language model reward (2024)"},{"key":"16_CR51","unstructured":"Zheng, Z., et al.: Open-sora: democratizing efficient video production for all (2024). https:\/\/github.com\/hpcaitech\/Open-Sora"},{"key":"16_CR52","doi-asserted-by":"crossref","unstructured":"Zhou, L., Xu, C., Corso, J.: Towards automatic learning of procedures from web instructional videos. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol.\u00a032 (2018)","DOI":"10.1609\/aaai.v32i1.12342"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024 Workshops"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-92808-6_16","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,5,27]],"date-time":"2025-05-27T15:59:57Z","timestamp":1748361597000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-92808-6_16"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025]]},"ISBN":["9783031928079","9783031928086"],"references-count":52,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-92808-6_16","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025]]},"assertion":[{"value":"12 May 2025","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}