{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,15]],"date-time":"2026-04-15T05:43:10Z","timestamp":1776231790980,"version":"3.50.1"},"reference-count":249,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","issue":"3","license":[{"start":{"date-parts":[[2026,3,1]],"date-time":"2026-03-01T00:00:00Z","timestamp":1772323200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2026,3,1]],"date-time":"2026-03-01T00:00:00Z","timestamp":1772323200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2026,3,1]],"date-time":"2026-03-01T00:00:00Z","timestamp":1772323200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62472068"],"award-info":[{"award-number":["62472068"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Municipal Government of Quzhou","award":["2024D036"],"award-info":[{"award-number":["2024D036"]}]},{"name":"DFF Inge Lehmann","award":["4303-00014"],"award-info":[{"award-number":["4303-00014"]}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Trans. Knowl. Data Eng."],"published-print":{"date-parts":[[2026,3]]},"DOI":"10.1109\/tkde.2026.3651536","type":"journal-article","created":{"date-parts":[[2026,1,12]],"date-time":"2026-01-12T22:03:44Z","timestamp":1768255424000},"page":"2040-2063","source":"Crossref","is-referenced-by-count":5,"title":["Unraveling Spatio-Temporal Foundation Models via the Pipeline Lens: A Comprehensive Review"],"prefix":"10.1109","volume":"38","author":[{"given":"Yuchen","family":"Fang","sequence":"first","affiliation":[{"name":"University of Electronic Science and Technology of China, Chengdu, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9346-7133","authenticated-orcid":false,"given":"Hao","family":"Miao","sequence":"additional","affiliation":[{"name":"The Hong Kong Polytechnic University, Hong Kong SAR, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2817-7337","authenticated-orcid":false,"given":"Yuxuan","family":"Liang","sequence":"additional","affiliation":[{"name":"Hong Kong University of Science and Technology, Guangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9377-4309","authenticated-orcid":false,"given":"Liwei","family":"Deng","sequence":"additional","affiliation":[{"name":"Aalborg University, Aalborg, Denmark"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1656-5407","authenticated-orcid":false,"given":"Yue","family":"Cui","sequence":"additional","affiliation":[{"name":"Hong Kong University of Science and Technology, Hong Kong SAR, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5871-1871","authenticated-orcid":false,"given":"Ximu","family":"Zeng","sequence":"additional","affiliation":[{"name":"University of Electronic Science and Technology of China, Chengdu, China"}]},{"given":"Yuyang","family":"Xia","sequence":"additional","affiliation":[{"name":"University of Electronic Science and Technology of China, Chengdu, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0242-3707","authenticated-orcid":false,"given":"Yan","family":"Zhao","sequence":"additional","affiliation":[{"name":"Shenzhen Institute for Advanced Study, University of Electronic Science and Technology of China, Shenzhen, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1615-777X","authenticated-orcid":false,"given":"Torben Bach","family":"Pedersen","sequence":"additional","affiliation":[{"name":"Aalborg University, Aalborg, Denmark"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9697-7670","authenticated-orcid":false,"given":"Christian S.","family":"Jensen","sequence":"additional","affiliation":[{"name":"Aalborg University, Aalborg, Denmark"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6343-1455","authenticated-orcid":false,"given":"Xiaofang","family":"Zhou","sequence":"additional","affiliation":[{"name":"Hong Kong University of Science and Technology, Hong Kong SAR, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0217-3998","authenticated-orcid":false,"given":"Kai","family":"Zheng","sequence":"additional","affiliation":[{"name":"University of Electronic Science and Technology of China, Chengdu, China"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1109\/TKDE.2023.3333824"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.24963\/ijcai.2018\/482"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1109\/TITS.2022.3197640"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.322"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1145\/3583780.3615215"},{"key":"ref6","article-title":"Scaling laws for neural language models","author":"Kaplan","year":"2020"},{"key":"ref7","first-page":"1","article-title":"LoRA: Low-rank adaptation of large language models","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Hu","year":"2021"},{"key":"ref8","first-page":"24824","article-title":"Chain-of-thought prompting elicits reasoning in large language models","volume-title":"Proc. 36th Int. Conf. Neural Inf. Process. Syst.","author":"Wei","year":"2022"},{"key":"ref9","article-title":"Large models for time series and spatio-temporal data: A survey and outlook","author":"Jin","year":"2023"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1145\/3637528.3671451"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1145\/3637528.3671453"},{"key":"ref12","article-title":"Spatio-Temporal foundation models: Vision, challenges, and opportunities","author":"Goodge","year":"2025"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1145\/3711896.3736552"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1109\/ICDE60146.2024.00342"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/TKDE.2023.3323535"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/icde51399.2021.00066"},{"key":"ref17","first-page":"1","article-title":"UniTraj: Learning a universal trajectory foundation model from billion-scale worldwide traces","volume-title":"Proc. Int. Conf. Neural Inf. Process. Syst.","author":"Zhu","year":"2025"},{"key":"ref18","article-title":"PTR: A pre-trained language model for trajectory recovery","author":"Wei","year":"2024"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1080\/13658816.2017.1400548"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1109\/ICDE55515.2023.00070"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1109\/ICDE60146.2024.00337"},{"key":"ref22","volume-title":"GNSS\u2013Global Navigation Satellite Systems: GPS, GLONASS, Galileo, and More","author":"Hofmann-Wellenhof","year":"2007"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i8.28672"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1109\/TKDE.2023.3347513"},{"key":"ref25","first-page":"1","article-title":"GTR: A general, multi-view, and dynamic framework for trajectory representation learning","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Wang","year":"2025"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.14778\/3632093.3632105"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1016\/j.neucom.2023.127063"},{"key":"ref28","first-page":"1","article-title":"TrajMamba: An efficient and semantic-rich vehicle trajectory pre-training model","volume-title":"Proc. Int. Conf. Neural Inf. Process. Syst.","author":"Lin","year":"2025"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1145\/3361741"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1145\/3637528.3671866"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.findings-acl.378"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.findings-acl.955"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1145\/3637528.3671466"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1145\/3292500.3330919"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.48550\/arXiv.1310.4546"},{"key":"ref36","article-title":"MIRAI: Evaluating LLM agents for event forecasting","author":"Ye","year":"2024"},{"key":"ref37","first-page":"29532","article-title":"Language models can improve event prediction by few-shot abductive reasoning","volume-title":"Proc. Int. Conf. Neural Inf. Process. Syst.","author":"Shi","year":"2024"},{"key":"ref38","first-page":"46238","article-title":"Latent logic tree extraction for event sequence explanation from LLMs","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Song","year":"2024"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1145\/3589334.3645376"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v36i4.20330"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1145\/3534678.3539427"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-86365-4_16"},{"key":"ref43","first-page":"25904","article-title":"ClimaX: A foundation model for weather and climate","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Nguyen","year":"2023"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.1145\/3637528.3671662"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.1038\/s41586-023-06185-3"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.1145\/3583780.3614842"},{"key":"ref47","first-page":"1","article-title":"UrbanDiT: A foundation model for open-world urban spatio-temporal learning","volume-title":"Proc. Int. Conf. Neural Inf. Process. Syst.","author":"Yuan","year":"2025"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00378"},{"key":"ref49","first-page":"1","article-title":"WeatherGFM: Learning a weather generalist foundation model via in-context learning","volume-title":"Proc. Int. Conf. Log. Program.","author":"Zhao","year":"2025"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.1145\/3637528.3671578"},{"key":"ref51","article-title":"VideoLLM: Modeling video sequence with large language models","author":"Chen","year":"2023"},{"key":"ref52","doi-asserted-by":"publisher","DOI":"10.48550\/arXiv.2010.11929"},{"key":"ref53","doi-asserted-by":"publisher","DOI":"10.1007\/s11432-024-4321-9"},{"key":"ref54","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01725"},{"key":"ref55","first-page":"1","article-title":"SEINE: Short-to-long video diffusion model for generative transition and prediction","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Chen","year":"2024"},{"key":"ref56","first-page":"1","article-title":"Seer: Language instructed video prediction with latent diffusion models","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Gu","year":"2024"},{"key":"ref57","first-page":"8748","article-title":"Learning transferable visual models from natural language supervision","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Radford","year":"2021"},{"key":"ref58","first-page":"124","article-title":"Zero-shot video question answering via frozen bidirectional language models","volume-title":"Proc. Int. Conf. Neural Inf. Process. Syst.","author":"Yang","year":"2022"},{"key":"ref59","doi-asserted-by":"publisher","DOI":"10.1109\/ICCVW60793.2023.00035"},{"key":"ref60","first-page":"19730","article-title":"BLIP-2: Bootstrapping language-image pre-training with frozen image encoders and large language models","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Li","year":"2023"},{"key":"ref61","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.emnlp-demo.49"},{"key":"ref62","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01389"},{"key":"ref63","doi-asserted-by":"publisher","DOI":"10.24963\/ijcai.2024\/282"},{"key":"ref64","first-page":"52595","article-title":"Frequency-aware generative models for multivariate time series imputation","volume-title":"Proc. Int. Conf. Neural Inf. Process. Syst.","author":"Yang","year":"2024"},{"key":"ref65","first-page":"26304","article-title":"Foundation model for intracranial neural signal","volume-title":"Proc. Int. Conf. Neural Inf. Process. Syst.","author":"Zhang","year":"2024"},{"key":"ref66","doi-asserted-by":"publisher","DOI":"10.1145\/3534678.3539396"},{"key":"ref67","first-page":"70229","article-title":"Generative pre-training of spatio-temporal graph neural networks","volume-title":"Proc. Int. Conf. Neural Inf. Process. Syst.","author":"Li","year":"2023"},{"key":"ref68","first-page":"3998","article-title":"Spatial-temporal-decoupled masked pre-training for spatiotemporal forecasting","volume-title":"Proc. Int. Joint Conf. Artif. Intell.","author":"Gao","year":"2024"},{"key":"ref69","doi-asserted-by":"publisher","DOI":"10.1145\/3690624.3709177"},{"key":"ref70","first-page":"28978","article-title":"FlashST: A simple and universal prompt-tuning framework for traffic prediction","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Li","year":"2024"},{"key":"ref71","doi-asserted-by":"publisher","DOI":"10.1145\/3773912"},{"key":"ref72","first-page":"69586","article-title":"PPi: Pretraining brain signal model for patient-independent seizure detection","volume-title":"Proc. Int. Conf. Neural Inf. Process. Syst.","author":"Yuan","year":"2024"},{"key":"ref73","doi-asserted-by":"publisher","DOI":"10.1145\/3627673.3680023"},{"key":"ref74","first-page":"115233","article-title":"PowerPM: Foundation model for power systems","volume-title":"Proc. Int. Conf. Neural Inf. Process. Syst.","author":"Tu","year":"2024"},{"key":"ref75","doi-asserted-by":"publisher","DOI":"10.1145\/3637528.3671548"},{"key":"ref76","doi-asserted-by":"publisher","DOI":"10.1109\/TSMC.2014.2327053"},{"key":"ref77","doi-asserted-by":"publisher","DOI":"10.1145\/2020408.2020579"},{"issue":"2","key":"ref78","first-page":"32","article-title":"GeoLife: A collaborative social networking service among user, location and trajectory","volume":"33","author":"Zheng","year":"2010","journal-title":"IEEE Data Eng. Bull."},{"key":"ref79","first-page":"5273","article-title":"OpenForecast: A large-scale open-ended event forecasting dataset","volume-title":"Proc. 31st Int. Conf. Comput. Linguistics","author":"Wang","year":"2025"},{"key":"ref80","first-page":"1","article-title":"GDELT: Global data on events, location, and tone","volume-title":"Proc. ISA Annu. Conv.","author":"Leetaru","year":"2013"},{"key":"ref81","doi-asserted-by":"publisher","DOI":"10.1145\/3511808.3557233"},{"key":"ref82","doi-asserted-by":"publisher","DOI":"10.1145\/3583780.3615016"},{"key":"ref83","doi-asserted-by":"publisher","DOI":"10.1038\/sdata.2016.35"},{"key":"ref84","doi-asserted-by":"publisher","DOI":"10.1029\/2020MS002203"},{"key":"ref85","first-page":"22009","article-title":"SEVIR: A storm event imagery dataset for deep learning applications in radar and satellite meteorology","volume-title":"Proc. Int. Conf. Neural Inf. Process. Syst.","author":"Veillette","year":"2020"},{"key":"ref86","doi-asserted-by":"publisher","DOI":"10.3390\/rs15010102"},{"key":"ref87","doi-asserted-by":"publisher","DOI":"10.1109\/ICDE53745.2022.00269"},{"key":"ref88","doi-asserted-by":"publisher","DOI":"10.1145\/3637528.3671843"},{"key":"ref89","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2012.6248074"},{"key":"ref90","doi-asserted-by":"publisher","DOI":"10.1109\/ICPR.2004.1334462"},{"key":"ref91","article-title":"UCF101: A dataset of 101 human actions classes from videos in the wild","author":"Soomro","year":"2012"},{"key":"ref92","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2013.338"},{"key":"ref93","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.622"},{"key":"ref94","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00175"},{"key":"ref95","first-page":"1","article-title":"Diffusion convolutional recurrent neural network: Data-driven traffic forecasting","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Li","year":"2017"},{"key":"ref96","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i01.5438"},{"key":"ref97","first-page":"75354","article-title":"LargeST: A benchmark dataset for large-scale traffic forecasting","volume-title":"Proc. Int. Conf. Neural Inf. Process. Syst.","author":"Liu","year":"2023"},{"key":"ref98","doi-asserted-by":"publisher","DOI":"10.1145\/3627673.3679806"},{"key":"ref99","doi-asserted-by":"publisher","DOI":"10.1145\/3394486.3403118"},{"key":"ref100","first-page":"1","article-title":"Leveraging vision-language models for granular market change prediction","volume-title":"Proc. AAAI Conf. Artif. Intell. Muffin","author":"Wimmer","year":"2023"},{"key":"ref101","doi-asserted-by":"publisher","DOI":"10.1145\/3219819.3219822"},{"key":"ref102","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v37i12.26676"},{"key":"ref103","doi-asserted-by":"publisher","DOI":"10.1109\/TAMD.2015.2431497"},{"key":"ref104","doi-asserted-by":"publisher","DOI":"10.1016\/j.neuroimage.2007.01.051"},{"key":"ref105","doi-asserted-by":"publisher","DOI":"10.1161\/01.CTR.101.23.e215"},{"key":"ref106","doi-asserted-by":"publisher","DOI":"10.1109\/TKDE.2024.3484454"},{"key":"ref107","doi-asserted-by":"publisher","DOI":"10.1109\/ICDE55515.2023.00046"},{"key":"ref108","doi-asserted-by":"publisher","DOI":"10.1109\/TITS.2021.3094659"},{"key":"ref109","doi-asserted-by":"publisher","DOI":"10.1016\/j.trc.2020.01.010"},{"key":"ref110","doi-asserted-by":"publisher","DOI":"10.1145\/3511808.3557308"},{"key":"ref111","doi-asserted-by":"publisher","DOI":"10.1145\/3589334.3645644"},{"key":"ref112","first-page":"27268","article-title":"FEDformer: Frequency enhanced decomposed transformer for long-term series forecasting","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Zhou","year":"2022"},{"key":"ref113","article-title":"TrajFM: A vehicle trajectory foundation model for region and task transferability","author":"Lin","year":"2024"},{"key":"ref114","article-title":"EEGPT: Unleashing the potential of EEG generalist foundation model by autoregressive pre-training","author":"Yue","year":"2024"},{"key":"ref115","first-page":"1","article-title":"Large brain model for learning generic representations with tremendous EEG data in BCI","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Jiang","year":"2024"},{"key":"ref116","doi-asserted-by":"publisher","DOI":"10.52202\/079017-2359"},{"key":"ref117","first-page":"6840","article-title":"Denoising diffusion probabilistic models","volume-title":"Proc. Int. Conf. Neural Inf. Process. Syst.","author":"Ho","year":"2020"},{"key":"ref118","doi-asserted-by":"publisher","DOI":"10.24963\/ijcai.2019\/264"},{"key":"ref119","doi-asserted-by":"publisher","DOI":"10.1145\/3543507.3583304"},{"key":"ref120","first-page":"41151","article-title":"Spatial-temporal graph learning with adversarial contrastive adaptation","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Zhang","year":"2023"},{"key":"ref121","first-page":"144","article-title":"Spatial structure-aware road network embedding via graph contrastive learning","volume-title":"Proc. 26th Int. Conf. Extending Database Technol.","author":"Chang","year":"2023"},{"key":"ref122","article-title":"FENGWU: Pushing the skillful global medium-range weather forecast beyond 10 days lead","author":"Chen","year":"2023"},{"key":"ref123","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01359"},{"key":"ref124","doi-asserted-by":"publisher","DOI":"10.1038\/s41586-023-06185-3"},{"key":"ref125","first-page":"1","article-title":"FOURCASTNET: A global data-driven high-resolution weather model using adaptive fourier neural operators","volume-title":"Proc. Platform Adv. Sci. Comput. Conf.","author":"Pathak","year":"2023"},{"key":"ref126","first-page":"65168","article-title":"DiffTraj: Generating GPS trajectory with diffusion probabilistic model","volume-title":"Proc. Int. Conf. Neural Inf. Process. Syst.","author":"Zhu","year":"2023"},{"key":"ref127","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v37i4.25555"},{"key":"ref128","doi-asserted-by":"publisher","DOI":"10.24963\/ijcai.2024\/223"},{"key":"ref129","doi-asserted-by":"publisher","DOI":"10.1145\/3627673.3679810"},{"key":"ref130","doi-asserted-by":"publisher","DOI":"10.1145\/3589132.3625614"},{"key":"ref131","doi-asserted-by":"publisher","DOI":"10.1145\/3678717.3691235"},{"key":"ref132","doi-asserted-by":"publisher","DOI":"10.52202\/079017-3921"},{"key":"ref133","doi-asserted-by":"publisher","DOI":"10.1007\/s11390-024-3767-3"},{"key":"ref134","first-page":"29667","article-title":"PointGPT: Auto-regressively generative pre-training from point clouds","volume-title":"Proc. Int. Conf. Neural Inf. Process. Syst.","author":"Chen","year":"2024"},{"key":"ref135","doi-asserted-by":"publisher","DOI":"10.1109\/TKDE.2025.3570428"},{"key":"ref136","first-page":"1","article-title":"Large brain model for learning generic representations with tremendous EEG data in BCI","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Jiang","year":"2024"},{"key":"ref137","first-page":"68082","article-title":"iVideoGPT: Interactive videoGPTs are scalable world models","volume-title":"Proc. Int. Conf. Neural Inf. Process. Syst.","author":"Wu","year":"2024"},{"key":"ref138","first-page":"22004","article-title":"Everything is a video: Unifying modalities through next-frame prediction","volume-title":"Proc. Int. Conf. Comput. Vis.","author":"Hudson","year":"2025"},{"key":"ref139","doi-asserted-by":"publisher","DOI":"10.52202\/079017-2694"},{"key":"ref140","first-page":"4171","article-title":"BERT: Pre-training of deep bidirectional transformers for language understanding","volume-title":"Proc. Conf. North Amer. Chapter Assoc. Comput. Linguistics, Hum. Lang. Technol., Vol. 1 (Long Short Papers)","author":"J","year":"2019"},{"key":"ref141","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01553"},{"key":"ref142","first-page":"35946","article-title":"Masked autoencoders as spatiotemporal learners","volume-title":"Proc. Int. Conf. Neural Inf. Process. Syst.","author":"Feichtenhofer","year":"2022"},{"key":"ref143","doi-asserted-by":"publisher","DOI":"10.1145\/3627673.3679989"},{"key":"ref144","first-page":"10078","article-title":"VideoMAE: Masked autoencoders are data-efficient learners for self-supervised video pre-training","volume-title":"Proc. Int. Conf. Neural Inf. Process. Syst.","author":"Tong","year":"2022"},{"key":"ref145","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00150"},{"key":"ref146","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20086-1_35"},{"key":"ref147","doi-asserted-by":"publisher","DOI":"10.1109\/ICDE60146.2024.00099"},{"key":"ref148","doi-asserted-by":"publisher","DOI":"10.1016\/j.ins.2022.11.151"},{"key":"ref149","first-page":"18661","article-title":"Supervised contrastive learning","volume-title":"Proc. Int. Conf. Neural Inf. Process. Syst.","author":"Khosla","year":"2020"},{"key":"ref150","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01522"},{"key":"ref151","doi-asserted-by":"publisher","DOI":"10.1109\/iccv48922.2021.00153"},{"key":"ref152","first-page":"5812","article-title":"Graph contrastive learning with augmentations","volume-title":"Proc. Int. Conf. Neural Inf. Process. Syst.","author":"You","year":"2020"},{"key":"ref153","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3547783"},{"key":"ref154","doi-asserted-by":"publisher","DOI":"10.1145\/3583780.3615065"},{"key":"ref155","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2021.3127040"},{"key":"ref156","doi-asserted-by":"publisher","DOI":"10.1109\/TKDE.2023.3324501"},{"key":"ref157","doi-asserted-by":"publisher","DOI":"10.1145\/3557915.3560939"},{"key":"ref158","doi-asserted-by":"publisher","DOI":"10.1145\/3534678.3539422"},{"key":"ref159","doi-asserted-by":"publisher","DOI":"10.1109\/ICDE55515.2023.00224"},{"key":"ref160","doi-asserted-by":"publisher","DOI":"10.1145\/3589334.3645378"},{"key":"ref161","doi-asserted-by":"publisher","DOI":"10.1145\/3690624.3709209"},{"key":"ref162","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"ref163","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00387"},{"key":"ref164","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10096169"},{"key":"ref165","first-page":"45259","article-title":"DYffusion: A dynamics-informed diffusion model for spatiotemporal forecasting","volume-title":"Proc. Int. Conf. Neural Inf. Process. Syst.","author":"Cachay","year":"2024"},{"key":"ref166","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP48485.2024.10446690"},{"key":"ref167","first-page":"107604","article-title":"Causal deciphering and inpainting in spatio-temporal dynamics via diffusion model","volume-title":"Proc. Int. Conf. Neural Inf. Process. Syst.","author":"Duan","year":"2024"},{"key":"ref168","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i1.27802"},{"key":"ref169","doi-asserted-by":"publisher","DOI":"10.1145\/3580305.3599511"},{"key":"ref170","doi-asserted-by":"publisher","DOI":"10.1109\/ICDE55515.2023.00150"},{"key":"ref171","first-page":"2561","article-title":"SASDIM: Self-adaptive noise scaling diffusion model for spatial time series imputation","volume-title":"Proc. Int. Joint Conf. Artif. Intell.","author":"Zhang","year":"2024"},{"key":"ref172","article-title":"PhyDA: Physics-guided diffusion models for data assimilation in atmospheric systems","author":"Wang","year":"2025"},{"key":"ref173","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"ref174","first-page":"1","article-title":"An image is worth 16x16 words: Transformers for image recognition at scale","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Alexey","year":"2021"},{"key":"ref175","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00675"},{"key":"ref176","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01741"},{"key":"ref177","article-title":"EVA-CLIP: Improved training techniques for clip at scale","author":"Sun","year":"2023"},{"key":"ref178","doi-asserted-by":"publisher","DOI":"10.1016\/j.iotcps.2023.04.003"},{"key":"ref179","article-title":"Qwen technical report","author":"Bai","year":"2023"},{"key":"ref180","article-title":"ChatGLM: A family of large language models from GLM-130b to GLM-4 all tools","author":"GLM","year":"2024"},{"key":"ref181","doi-asserted-by":"publisher","DOI":"10.1007\/978-981-96-8180-8_28"},{"key":"ref182","doi-asserted-by":"publisher","DOI":"10.52202\/079017-3957"},{"key":"ref183","article-title":"Extracting spatiotemporal data from gradients with large language models","author":"Zheng","year":"2024"},{"key":"ref184","article-title":"Mixtral of experts","author":"Jiang","year":"2024"},{"key":"ref185","doi-asserted-by":"publisher","DOI":"10.1038\/s41746-022-00742-2"},{"key":"ref186","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.findings-emnlp.200"},{"key":"ref187","article-title":"GATGPT: A pre-trained large language model with graph attention network for spatiotemporal imputation","author":"Chen","year":"2023"},{"key":"ref188","article-title":"TPLLM: A traffic prediction framework based on pretrained large language models","author":"Ren","year":"2024"},{"key":"ref189","article-title":"How can large language models understand spatial-temporal data","author":"Liu","year":"2024"},{"key":"ref190","first-page":"1","article-title":"Empowering pre-trained language models for spatio-temporal forecasting via decoupling enhanced discrete reprogramming","volume-title":"Proc. Int. Joint Conf. Artif. Intell.","author":"Wang","year":"2025"},{"key":"ref191","doi-asserted-by":"publisher","DOI":"10.1145\/3557915.3561026"},{"key":"ref192","article-title":"LLaMA: Open and efficient foundation language models","author":"Touvron","year":"2023"},{"key":"ref193","doi-asserted-by":"publisher","DOI":"10.1109\/MDM61037.2024.00025"},{"key":"ref194","doi-asserted-by":"publisher","DOI":"10.1145\/3626772.3657840"},{"key":"ref195","doi-asserted-by":"publisher","DOI":"10.1145\/3637528.3671918"},{"key":"ref196","first-page":"1","article-title":"UrbanVLP: A multi-granularity vision-language pre-trained foundation model for urban indicator prediction","volume-title":"Proc. AAAI Conf. Artif. Intell.","author":"Hao","year":"2025"},{"key":"ref197","article-title":"LLaMA-Adapter V2: Parameter-efficient visual instruction model","author":"Gao","year":"2023"},{"key":"ref198","first-page":"1","article-title":"SPHINX: The joint mixing of weights, tasks, and visual embeddings for multi-modal large language models","volume-title":"Proc. 18th Eur. Conf. Comput. Vis.","author":"Lin","year":"2024"},{"key":"ref199","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i17.29858"},{"key":"ref200","doi-asserted-by":"publisher","DOI":"10.1007\/s10439-023-03272-4"},{"key":"ref201","doi-asserted-by":"publisher","DOI":"10.1109\/TKDE.2023.3342137"},{"key":"ref202","article-title":"Where would I go next? Large language models as human mobility predictors","author":"Wang","year":"2023"},{"key":"ref203","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.emnlp-industry.69"},{"key":"ref204","first-page":"1","article-title":"TEST: Text prototype aligned embedding to activate LLM\u2019s ability for time series","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Sun","year":"2024"},{"key":"ref205","first-page":"1","article-title":"Time-LLM: Time series forecasting by reprogramming large language models","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Jin","year":"2023"},{"key":"ref206","article-title":"Large language models are few-shot health learners","author":"Liu","year":"2023"},{"key":"ref207","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr52733.2024.01263"},{"key":"ref208","doi-asserted-by":"publisher","DOI":"10.2139\/ssrn.4464002"},{"key":"ref209","doi-asserted-by":"publisher","DOI":"10.1145\/3627673.3680042"},{"key":"ref210","doi-asserted-by":"publisher","DOI":"10.1109\/tmm.2025.3543034"},{"key":"ref211","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i1.27758"},{"key":"ref212","doi-asserted-by":"publisher","DOI":"10.1145\/3627673.3679973"},{"key":"ref213","doi-asserted-by":"publisher","DOI":"10.1109\/ITSC58415.2024.10920138"},{"key":"ref214","first-page":"1","article-title":"UrbanKGent: A unified large language model agent framework for urban knowledge graph construction","volume-title":"Proc. Int. Conf. Neural Inf. Process. Syst.","author":"Ning","year":"2024"},{"key":"ref215","doi-asserted-by":"publisher","DOI":"10.1145\/3726302.3729930"},{"key":"ref216","first-page":"6309","article-title":"Neural discrete representation learning","volume-title":"Proc. Int. Conf. Neural Inf. Process. Syst.","author":"Den","year":"2017"},{"key":"ref217","doi-asserted-by":"publisher","DOI":"10.3758\/s13423-014-0585-6"},{"key":"ref218","first-page":"25105","article-title":"VideoPoet: A large language model for zero-shot video generation","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Kondratyuk","year":"2024"},{"key":"ref219","doi-asserted-by":"publisher","DOI":"10.1145\/3696410.3714744"},{"key":"ref220","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v39i18.34067"},{"key":"ref221","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.emnlp-main.317"},{"key":"ref222","doi-asserted-by":"publisher","DOI":"10.1145\/3711896.3737164"},{"key":"ref223","doi-asserted-by":"publisher","DOI":"10.1145\/3588730"},{"key":"ref224","doi-asserted-by":"publisher","DOI":"10.1145\/3637528.3671509"},{"key":"ref225","doi-asserted-by":"publisher","DOI":"10.14778\/3705829.3705859"},{"key":"ref226","first-page":"1","article-title":"CaT-GNN: Enhancing credit card fraud detection via causal temporal graph neural networks","volume-title":"Proc. COLING","author":"Duan","year":"2024"},{"key":"ref227","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v37i4.25552"},{"key":"ref228","first-page":"1","article-title":"Timeseries suppliers allocation risk optimization via deep black litterman model","volume-title":"Proc. AAAI Conf. Artif. Intell.","author":"Luo","year":"2025"},{"key":"ref229","doi-asserted-by":"publisher","DOI":"10.1109\/ICDE.2016.7498228"},{"key":"ref230","doi-asserted-by":"publisher","DOI":"10.14778\/3407790.3407839"},{"key":"ref231","doi-asserted-by":"publisher","DOI":"10.1145\/3716134"},{"key":"ref232","doi-asserted-by":"publisher","DOI":"10.1109\/MDM65600.2025.00057"},{"key":"ref233","doi-asserted-by":"publisher","DOI":"10.1145\/3637528.3671937"},{"key":"ref234","doi-asserted-by":"publisher","DOI":"10.1038\/s41586-023-06160-y"},{"key":"ref235","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2025.acl-long.580"},{"key":"ref236","doi-asserted-by":"publisher","DOI":"10.1145\/3690624.3709379"},{"key":"ref237","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00788"},{"key":"ref238","doi-asserted-by":"publisher","DOI":"10.1109\/IROS58592.2024.10802397"},{"key":"ref239","doi-asserted-by":"publisher","DOI":"10.1016\/j.watres.2024.122162"},{"key":"ref240","doi-asserted-by":"publisher","DOI":"10.1038\/s43247-023-01188-4"},{"key":"ref241","article-title":"Spatial-temporal mixture-of-graph-experts for multi-type crime prediction","author":"Wu","year":"2024"},{"key":"ref242","doi-asserted-by":"publisher","DOI":"10.1029\/2025JH000601"},{"key":"ref243","doi-asserted-by":"publisher","DOI":"10.1109\/TGRS.2024.3354456"},{"key":"ref244","doi-asserted-by":"publisher","DOI":"10.1109\/TITS.2024.3374325"},{"key":"ref245","first-page":"21702","article-title":"LLM-Pruner: On the structural pruning of large language models","volume-title":"Proc. Int. Conf. Neural Inf. Process. Syst.","author":"Ma","year":"2023"},{"key":"ref246","doi-asserted-by":"publisher","DOI":"10.52202\/079017-1319"},{"key":"ref247","doi-asserted-by":"publisher","DOI":"10.14778\/3705829.3705841"},{"key":"ref248","doi-asserted-by":"publisher","DOI":"10.1109\/ICDE65448.2025.00334"},{"key":"ref249","doi-asserted-by":"publisher","DOI":"10.14778\/3665844.3665863"}],"container-title":["IEEE Transactions on Knowledge and Data Engineering"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/69\/11393947\/11341909.pdf?arnumber=11341909","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,2,13]],"date-time":"2026-02-13T20:51:28Z","timestamp":1771015888000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11341909\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,3]]},"references-count":249,"journal-issue":{"issue":"3"},"URL":"https:\/\/doi.org\/10.1109\/tkde.2026.3651536","relation":{},"ISSN":["1041-4347","1558-2191","2326-3865"],"issn-type":[{"value":"1041-4347","type":"print"},{"value":"1558-2191","type":"electronic"},{"value":"2326-3865","type":"electronic"}],"subject":[],"published":{"date-parts":[[2026,3]]}}}