{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,15]],"date-time":"2026-01-15T22:13:40Z","timestamp":1768515220166,"version":"3.49.0"},"reference-count":76,"publisher":"Springer Science and Business Media LLC","issue":"7","license":[{"start":{"date-parts":[[2025,2,21]],"date-time":"2025-02-21T00:00:00Z","timestamp":1740096000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,2,21]],"date-time":"2025-02-21T00:00:00Z","timestamp":1740096000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Int J Comput Vis"],"published-print":{"date-parts":[[2025,7]]},"DOI":"10.1007\/s11263-025-02388-5","type":"journal-article","created":{"date-parts":[[2025,2,21]],"date-time":"2025-02-21T06:41:42Z","timestamp":1740120102000},"page":"4112-4133","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":3,"title":["Bootstrapping Vision-Language Models for Frequency-Centric Self-Supervised Remote Physiological Measurement"],"prefix":"10.1007","volume":"133","author":[{"given":"Zijie","family":"Yue","sequence":"first","affiliation":[]},{"given":"Miaojing","family":"Shi","sequence":"additional","affiliation":[]},{"given":"Hanli","family":"Wang","sequence":"additional","affiliation":[]},{"given":"Shuai","family":"Ding","sequence":"additional","affiliation":[]},{"given":"Qijun","family":"Chen","sequence":"additional","affiliation":[]},{"given":"Shanlin","family":"Yang","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2025,2,21]]},"reference":[{"key":"2388_CR1","doi-asserted-by":"crossref","unstructured":"Bain, M., Nagrani, A., Varol, G., et\u00a0al. (2021). Frozen in time: A joint video and image encoder for end-to-end retrieval. In: ICCV.","DOI":"10.1109\/ICCV48922.2021.00175"},{"key":"2388_CR2","doi-asserted-by":"publisher","first-page":"82","DOI":"10.1016\/j.patrec.2017.10.017","volume":"124","author":"S Bobbia","year":"2019","unstructured":"Bobbia, S., Macwan, R., Benezeth, Y., et al. (2019). Unsupervised skin tissue segmentation for remote photoplethysmography. Pattern Recognition Letters, 124, 82\u201390.","journal-title":"Pattern Recognition Letters"},{"key":"2388_CR3","doi-asserted-by":"crossref","unstructured":"Chatterjee, A., Gokhale, T., Baral, C., et\u00a0al. (2024). On the robustness of language guidance for low-level vision tasks: Findings from depth estimation. In: CVPR.","DOI":"10.1109\/CVPR52733.2024.00270"},{"key":"2388_CR4","unstructured":"Chen, S., Ge, C., Tong, Z., et\u00a0al. (2022). Adaptformer: Adapting vision transformers for scalable visual recognition. In: NeurIPS."},{"issue":"10","key":"2388_CR5","doi-asserted-by":"publisher","first-page":"3600","DOI":"10.1109\/TIM.2018.2879706","volume":"68","author":"X Chen","year":"2019","unstructured":"Chen, X., Cheng, J., Song, R., et al. (2019). Video-based heart rate measurement: Recent advances and future prospects. IEEE Transactions on Instrumentation and Measurement, 68(10), 3600\u20133615.","journal-title":"IEEE Transactions on Instrumentation and Measurement"},{"key":"2388_CR6","doi-asserted-by":"crossref","unstructured":"Cheng, F., Wang, X., Lei, J., et\u00a0al. (2023). 
VindLU: A recipe for effective video-and-language pretraining. In: CVPR.","DOI":"10.1109\/CVPR52729.2023.01034"},{"issue":"10","key":"2388_CR7","doi-asserted-by":"publisher","first-page":"2878","DOI":"10.1109\/TBME.2013.2266196","volume":"60","author":"G De Haan","year":"2013","unstructured":"De Haan, G., & Jeanne, V. (2013). Robust pulse rate from chrominance-basedrPPG. IEEE Transactions on Biomedical Engineering, 60(10), 2878\u20132886.","journal-title":"IEEE Transactions on Biomedical Engineering"},{"key":"2388_CR8","unstructured":"Devlin, J., Chang, M.W., Lee, K., et\u00a0al. (2018). BERT: Pre-training of deep bidirectional transformers for language understanding. arXiv:1810.04805"},{"key":"2388_CR9","doi-asserted-by":"crossref","unstructured":"Du, J., Liu, S.Q., Zhang, B., et\u00a0al. (2023). Dual-bridging with adversarial noise generation for domain adaptive rPPG estimation. In: CVPR.","DOI":"10.1109\/CVPR52729.2023.00998"},{"key":"2388_CR10","doi-asserted-by":"crossref","unstructured":"Gideon, J., & Stent, S. (2021). The way to my heart is through contrastive learning: Remote photoplethysmography from unlabelled video. In: ICCV.","DOI":"10.1109\/ICCV48922.2021.00396"},{"key":"2388_CR11","doi-asserted-by":"crossref","unstructured":"Guo, J., Zhu, X., Yang, Y., et\u00a0al. (2020). Towards fast, accurate and stable 3d dense face alignment. In: ECCV.","DOI":"10.1007\/978-3-030-58529-7_10"},{"key":"2388_CR12","doi-asserted-by":"crossref","unstructured":"Gupta, A.K., Kumar, R., Birla, L., et\u00a0al. (2023). RADIANT: Better rPPG estimation using signal embeddings and transformer. In: WACV.","DOI":"10.1109\/WACV56688.2023.00495"},{"key":"2388_CR13","doi-asserted-by":"crossref","unstructured":"Jia, M., Tang, L., Chen, B.C., et\u00a0al. (2022). Visual prompt tuning. In: ECCV.","DOI":"10.1007\/978-3-031-19827-4_41"},{"key":"2388_CR14","doi-asserted-by":"crossref","unstructured":"Jiang, R., Liu, L., Chen, C. (2023). CLIP-count: Towards text-guided zero-shot object counting. In: ACM MM.","DOI":"10.1145\/3581783.3611789"},{"issue":"1","key":"2388_CR15","doi-asserted-by":"publisher","first-page":"4","DOI":"10.1038\/s41746-021-00543-z","volume":"5","author":"J Jorge","year":"2022","unstructured":"Jorge, J., Villarroel, M., Tomlinson, H., et al. (2022). Non-contact physiological monitoring of post-operative patients in the intensive care unit. npj Digital Medicine, 5(1), 4.","journal-title":"npj Digital Medicine"},{"issue":"1","key":"2388_CR16","doi-asserted-by":"publisher","first-page":"32","DOI":"10.1007\/s11263-016-0981-7","volume":"123","author":"R Krishna","year":"2017","unstructured":"Krishna, R., Zhu, Y., Groth, O., et al. (2017). Visual genome: Connecting language and vision using crowdsourced dense image annotations. International Journal of Computer Vision, 123(1), 32\u201373.","journal-title":"International Journal of Computer Vision"},{"key":"2388_CR17","unstructured":"Kwon, G., Cai, Z., Ravichandran, A., et\u00a0al. (2023). Masked vision and language modeling for multi-modal representation learning. In: ICLR."},{"key":"2388_CR18","doi-asserted-by":"crossref","unstructured":"Lam, A., & Kuno, Y. (2015). Robust heart rate measurement from video using select random patches. In: ICCV.","DOI":"10.1109\/ICCV.2015.415"},{"key":"2388_CR19","doi-asserted-by":"crossref","unstructured":"Lee, E., Chen, E., & Lee, C.Y. (2020a). Meta-rPPG: Remote heart rate estimation using a transductive meta-learner. 
In: ECCV.","DOI":"10.1007\/978-3-030-58583-9_24"},{"key":"2388_CR20","doi-asserted-by":"crossref","unstructured":"Li, Z., & Yin, L. (2023). Contactless pulse estimation leveraging pseudo labels and self-supervision. In: ICCV.","DOI":"10.1109\/ICCV51070.2023.01882"},{"key":"2388_CR21","unstructured":"Li, J., Li, D., Savarese, S., et\u00a0al. (2023a). BLIP-2: Bootstrapping language-image pre-training with frozen image encoders and large language models. In: ICML."},{"key":"2388_CR22","unstructured":"Li, J., Li, D., Xiong, C., et\u00a0al. (2022). BLIP: Bootstrapping language-image pre-training for unified vision-language understanding and generation. In: ICML."},{"key":"2388_CR23","unstructured":"Li, J., Selvaraju, R.R., Gotmare, A.D., et\u00a0al. (2021). Align before fuse: Vision and language representation learning with momentum distillation. In: NeurIPS."},{"key":"2388_CR24","doi-asserted-by":"crossref","unstructured":"Li, J., Yu, Z., & Shi, J. (2023b). Learning motion-robust remote photoplethysmography through arbitrary resolution videos. In: AAAI.","DOI":"10.1609\/aaai.v37i1.25217"},{"key":"2388_CR25","doi-asserted-by":"crossref","unstructured":"Liang, D., Xie, J., Zou, Z., et\u00a0al. (2023). CrowdCLIP: unsupervised crowd counting via vision-language model. In: CVPR.","DOI":"10.1109\/CVPR52729.2023.00283"},{"key":"2388_CR26","doi-asserted-by":"crossref","unstructured":"Liu, Y., Jourabloo, A., & Liu, X. (2018). Learning deep models for face anti-spoofing: binary or auxiliary supervision. In: CVPR.","DOI":"10.1109\/CVPR.2018.00048"},{"key":"2388_CR27","doi-asserted-by":"publisher","first-page":"27","DOI":"10.1109\/TIP.2023.3330108","volume":"33","author":"SQ Liu","year":"2024","unstructured":"Liu, S. Q., & Yuen, P. C. (2024). Robust remote photoplethysmography estimation with environmental noise disentanglement. IEEE Transactions on Image Processing, 33, 27\u201341.","journal-title":"IEEE Transactions on Image Processing"},{"issue":"663","key":"2388_CR28","doi-asserted-by":"publisher","first-page":"eadc9669","DOI":"10.1126\/scitranslmed.adc9669","volume":"14","author":"Y Liu","year":"2022","unstructured":"Liu, Y., Zhang, G., Tarolli, C. G., et al. (2022). Monitoring gait at home with radio waves in Parkinson\u2019s disease: A marker of severity, progression, and medication response. Science Translational Medicine, 14(663), eadc9669.","journal-title":"Science Translational Medicine"},{"key":"2388_CR29","doi-asserted-by":"publisher","first-page":"7278","DOI":"10.1109\/TMM.2024.3363660","volume":"26","author":"X Liu","year":"2024","unstructured":"Liu, X., Zhang, Y., Yu, Z., et al. (2024). rPPG-MAE: Self-supervised pretraining with masked autoencoders for remote physiological measurements. IEEE Transactions on Multimedia, 26, 7278\u20137293.","journal-title":"IEEE Transactions on Multimedia"},{"key":"2388_CR30","unstructured":"Loshchilov, I., & Hutter, F. (2019). Decoupled weight decay regularization. In: ICLR."},{"key":"2388_CR31","doi-asserted-by":"crossref","unstructured":"Lu, H., Han, H., & Zhou, S.K. (2021). Dual-GAN: Joint BVP and noise modeling for remote physiological measurement. In: CVPR.","DOI":"10.1109\/CVPR46437.2021.01222"},{"key":"2388_CR32","doi-asserted-by":"crossref","unstructured":"Lu, H., Yu, Z., Niu, X., et\u00a0al. (2023). Neuron structure modeling for generalizable remote physiological measurement. 
In: CVPR.","DOI":"10.1109\/CVPR52729.2023.01783"},{"issue":"9","key":"2388_CR33","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3558518","volume":"55","author":"D McDuff","year":"2023","unstructured":"McDuff, D. (2023). Camera measurement of physiological vital signs. ACM Computing Surveys, 55(9), 1\u201340.","journal-title":"ACM Computing Surveys"},{"key":"2388_CR34","doi-asserted-by":"crossref","unstructured":"Niu, X., Han, H., Shan, S., et\u00a0al. (2018). SynRhythm: Learning a deep heart rate estimator from general to specific. In: ICPR.","DOI":"10.1109\/ICPR.2018.8546321"},{"key":"2388_CR35","doi-asserted-by":"crossref","unstructured":"Niu, X., Yu, Z., Han, H., et\u00a0al. (2020b). Video-based remote physiological measurement via cross-verified feature disentangling. In: ECCV.","DOI":"10.1007\/978-3-030-58536-5_18"},{"key":"2388_CR36","doi-asserted-by":"publisher","first-page":"2409","DOI":"10.1109\/TIP.2019.2947204","volume":"29","author":"X Niu","year":"2020","unstructured":"Niu, X., Shan, S., Han, H., et al. (2020). RhythmNet: End-to-end heart rate estimation from face via spatial-temporal representation. IEEE Transactions on Image Processing, 29, 2409\u20132423.","journal-title":"IEEE Transactions on Image Processing"},{"key":"2388_CR37","unstructured":"Ordonez, V., Kulkarni, G., & Berg, T.L. (2011). Im2Text: Describing images using 1 million captioned photographs. In: NeurIPS."},{"key":"2388_CR38","doi-asserted-by":"crossref","unstructured":"Pramanick, S., Song, Y., Nag, S., et\u00a0al. (2023). EgoVLPv2: Egocentric video-language pre-training with fusion in the backbone. In: ICCV.","DOI":"10.1109\/ICCV51070.2023.00487"},{"key":"2388_CR39","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1109\/TCSS.2024.3356713","volume":"11","author":"W Qian","year":"2024","unstructured":"Qian, W., Guo, D., Li, K., et al. (2024). Dual-path token learner for remote photoplethysmography-based physiological measurement with facial videos. IEEE Transactions on Computational Social Systems, 11, 1\u201313.","journal-title":"IEEE Transactions on Computational Social Systems"},{"issue":"7","key":"2388_CR40","doi-asserted-by":"publisher","first-page":"1778","DOI":"10.1109\/TMM.2018.2883866","volume":"21","author":"Y Qiu","year":"2019","unstructured":"Qiu, Y., Liu, Y., Arteaga-Falconi, J., et al. (2019). EVM-CNN: Real-time contactless heart rate estimation from facial video. IEEE Transactions on Multimedia, 21(7), 1778\u20131787.","journal-title":"IEEE Transactions on Multimedia"},{"key":"2388_CR41","unstructured":"Radford, A., Kim, J.W., Hallacy, C., et\u00a0al. (2021). Learning transferable visual models from natural language supervision. In: ICML."},{"key":"2388_CR42","doi-asserted-by":"crossref","unstructured":"Sharma, P., Ding, N., Goodman, S., et\u00a0al. (2018). Conceptual captions: A cleaned, hypernymed, image alt-text dataset for automatic image captioning. In: Proceedings of the 56th Annual Meeting of the Association for Computational Linguistics.","DOI":"10.18653\/v1\/P18-1238"},{"key":"2388_CR43","first-page":"1","volume":"133","author":"H Shi","year":"2024","unstructured":"Shi, H., Dao, S. D., & Cai, J. (2024). Llmformer: large language model for open-vocabulary semantic segmentation. International Journal of Computer Vision, 133, 1\u201318.","journal-title":"International Journal of Computer Vision"},{"key":"2388_CR44","unstructured":"S\u00f8nderby, C.K., Espeholt, L., Heek, J., et\u00a0al. (2020). Metnet: A neural weather model for precipitation forecasting. 
arXiv preprint arXiv:2003.12140"},{"issue":"5","key":"2388_CR45","doi-asserted-by":"publisher","first-page":"1373","DOI":"10.1109\/JBHI.2021.3051176","volume":"25","author":"R Song","year":"2021","unstructured":"Song, R., Chen, H., Cheng, J., et al. (2021). PulseGAN: Learning to generate realistic pulse waveforms in remote photoplethysmography. IEEE Journal of Biomedical and Health Informatics, 25(5), 1373\u20131384.","journal-title":"IEEE Journal of Biomedical and Health Informatics"},{"key":"2388_CR46","doi-asserted-by":"crossref","unstructured":"Speth, J., Vance, N., Flynn, P., et\u00a0al. (2023). Non-contrastive unsupervised learning of physiological signals from video. In: CVPR.","DOI":"10.1109\/CVPR52729.2023.01390"},{"key":"2388_CR47","doi-asserted-by":"crossref","unstructured":"Stricker, R., Muller, S., & Gross, H.M. (2014). Non-contact video-based pulse rate measurement on a mobile service robot. In: IEEE international symposium on robot and human interactive communication.","DOI":"10.1109\/ROMAN.2014.6926392"},{"key":"2388_CR48","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1109\/TPAMI.2023.3345866","volume":"46","author":"Z Sun","year":"2024","unstructured":"Sun, Z., & Li, X. (2024). Contrast-Phys+: Unsupervised and weakly-supervised video-based remote physiological measurement via spatiotemporal contrast. IEEE Transactions on Pattern Analysis and Machine Intelligence, 46, 1\u201318.","journal-title":"IEEE Transactions on Pattern Analysis and Machine Intelligence"},{"issue":"1","key":"2388_CR49","doi-asserted-by":"publisher","first-page":"32","DOI":"10.5334\/jors.241","volume":"7","author":"P Van Gent","year":"2019","unstructured":"Van Gent, P., Farah, H., Van Nes, N., et al. (2019). Analysing noisy driver physiology real-time using off-the-shelf sensors: Heart rate analysis software from the taking the fast lane project. Journal of Open Research Software, 7(1), 32.","journal-title":"Journal of Open Research Software"},{"key":"2388_CR50","doi-asserted-by":"crossref","unstructured":"Varma, M., Delbrouck, J.B., Hooper, S., et\u00a0al. (2023). ViLLA: Fine-grained vision-language representation learning from real-world data. In: ICCV.","DOI":"10.1109\/ICCV51070.2023.02031"},{"key":"2388_CR51","doi-asserted-by":"crossref","unstructured":"Verkruysse, W., Svaasand, L. O., & Nelson, J. S. (2008). Remote plethysmographic imaging using ambient light. Optics Express, 16(26), 21434.","DOI":"10.1364\/OE.16.021434"},{"key":"2388_CR52","doi-asserted-by":"crossref","unstructured":"Wang, H., Ahn, E., & Kim, J. (2022a). Self-supervised representation learning framework for remote physiological measurement using spatiotemporal augmentation loss. In: AAAI.","DOI":"10.1609\/aaai.v36i2.20143"},{"key":"2388_CR53","doi-asserted-by":"crossref","unstructured":"Wang, H., Ahn, E., & Kim, J. (2022b). Self-supervised representation learning framework for remote physiological measurement using spatiotemporal augmentation loss. In: AAAI.","DOI":"10.1609\/aaai.v36i2.20143"},{"key":"2388_CR54","doi-asserted-by":"crossref","unstructured":"Wang, J., Ge, Y., Yan, R., et\u00a0al. (2023). All in One: Exploring unified video-language pre-training. In: CVPR.","DOI":"10.1109\/CVPR52729.2023.00638"},{"issue":"7","key":"2388_CR55","doi-asserted-by":"publisher","first-page":"1479","DOI":"10.1109\/TBME.2016.2609282","volume":"64","author":"W Wang","year":"2017","unstructured":"Wang, W., Den Brinker, A. C., Stuijk, S., et al. (2017). Algorithmic principles of remote PPG. 
IEEE Transactions on Biomedical Engineering, 64(7), 1479\u20131491.","journal-title":"IEEE Transactions on Biomedical Engineering"},{"issue":"9","key":"2388_CR56","doi-asserted-by":"publisher","first-page":"1974","DOI":"10.1109\/TBME.2015.2508602","volume":"63","author":"W Wang","year":"2016","unstructured":"Wang, W., Stuijk, S., & de Haan, G. (2016). A novel algorithm for remote photoplethysmography: Spatial subspace rotation. IEEE Transactions on Biomedical Engineering, 63(9), 1974\u20131984.","journal-title":"IEEE Transactions on Biomedical Engineering"},{"key":"2388_CR57","doi-asserted-by":"crossref","unstructured":"Xu, H., Ghosh, G., Huang, P.Y., et\u00a0al. (2021). VideoCLIP: Contrastive pre-training for zero-shot video-text understanding. In: EMNLP.","DOI":"10.18653\/v1\/2021.emnlp-main.544"},{"key":"2388_CR58","unstructured":"Xue, H., Sun, Y., Liu, B., et\u00a0al. (2023). CLIP-ViP: Adapting pre-trained image-text model to video-language representation alignment. In: ICLR."},{"key":"2388_CR59","unstructured":"Yang, Y., Liu, X., Wu, J., et\u00a0al. (2023). SimPer: Simple self-supervised learning of periodic targets. In: ICLR."},{"issue":"10","key":"2388_CR60","doi-asserted-by":"publisher","first-page":"2207","DOI":"10.1038\/s41591-022-01932-x","volume":"28","author":"Y Yang","year":"2022","unstructured":"Yang, Y., Yuan, Y., Zhang, G., et al. (2022). Artificial intelligence-enabled detection and assessment of Parkinson\u2019s disease using nocturnal breathing signals. Nature Medicine, 28(10), 2207\u20132215.","journal-title":"Nature Medicine"},{"issue":"1","key":"2388_CR61","doi-asserted-by":"publisher","first-page":"105","DOI":"10.1001\/jamacardio.2019.4004","volume":"5","author":"BP Yan","year":"2020","unstructured":"Yan, B. P., Lai, W. H. S., Chan, C. K. Y., et al. (2020). High-throughput, contact-free detection of atrial fibrillation from video with deep learning. JAMA Cardiology, 5(1), 105.","journal-title":"JAMA Cardiology"},{"key":"2388_CR62","doi-asserted-by":"crossref","unstructured":"Ye, Q. (2023). HiTeA: Hierarchical temporal-aware video-language pre-training. In: ICCV.","DOI":"10.1109\/ICCV51070.2023.01413"},{"key":"2388_CR63","doi-asserted-by":"crossref","unstructured":"Yu, Z., Peng, W., Li, X., et\u00a0al. (2019). Remote heart rate measurement from highly compressed facial videos: An end-to-end deep learning solution with video enhancement. In: ICCV.","DOI":"10.1109\/ICCV.2019.00024"},{"key":"2388_CR64","doi-asserted-by":"crossref","unstructured":"Yu, Z., Shen, Y., Shi, J., et\u00a0al. (2022b). PhysFormer: Facial video-based physiological measurement with temporal difference transformer. In: CVPR.","DOI":"10.1109\/CVPR52688.2022.00415"},{"key":"2388_CR65","unstructured":"Yu, J., Wang, Z., Vasudevan, V., et\u00a0al. (2022a). CoCa: Contrastive captioners are image-text foundation models. arXiv:2205.01917"},{"key":"2388_CR66","doi-asserted-by":"crossref","unstructured":"Yuan, L., Shi, M., Yue, Z., et\u00a0al. (2024). Losh: Long-short text joint prediction network for referring video object segmentation. In: CVPR.","DOI":"10.1109\/CVPR52733.2024.01328"},{"key":"2388_CR67","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1109\/TIM.2021.3109398","volume":"70","author":"Z Yue","year":"2021","unstructured":"Yue, Z., Ding, S., Yang, S., et al. (2021). Deep super-resolution network for rPPG information recovery and noncontact heart rate estimation. 
IEEE Transactions on Instrumentation and Measurement, 70, 1\u201311.","journal-title":"IEEE Transactions on Instrumentation and Measurement"},{"key":"2388_CR68","first-page":"1","volume":"71","author":"Z Yue","year":"2022","unstructured":"Yue, Z., Ding, S., Yang, S., et al. (2022). Multimodal information fusion approach for noncontact heart rate estimation using facial videos and graph convolutional network. IEEE Transactions on Instrumentation and Measurement, 71, 1\u201313.","journal-title":"IEEE Transactions on Instrumentation and Measurement"},{"key":"2388_CR69","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1109\/TPAMI.2023.3298650","volume":"45","author":"Z Yue","year":"2023","unstructured":"Yue, Z., Shi, M., & Ding, S. (2023). Facial video-based remote physiological measurement via self-supervised learning. IEEE Transactions on Pattern Analysis and Machine Intelligence, 45, 1\u201316.","journal-title":"IEEE Transactions on Pattern Analysis and Machine Intelligence"},{"issue":"6","key":"2388_CR70","doi-asserted-by":"publisher","first-page":"50","DOI":"10.1109\/MSP.2021.3106285","volume":"38","author":"Z Yu","year":"2021","unstructured":"Yu, Z., Li, X., & Zhao, G. (2021). Facial-video-based physiological signal measurement: Recent advances and affective applications. IEEE Signal Processing Magazine, 38(6), 50\u201358.","journal-title":"IEEE Signal Processing Magazine"},{"key":"2388_CR71","first-page":"1","volume":"133","author":"Y Zang","year":"2024","unstructured":"Zang, Y., Li, W., Han, J., et al. (2024). Contextual object detection with multimodal large language models. International Journal of Computer Vision, 133, 1\u201319.","journal-title":"International Journal of Computer Vision"},{"key":"2388_CR72","doi-asserted-by":"crossref","unstructured":"Zhang, R., Zeng, Z., Guo, Z., et\u00a0al. (2022). Can language understand depth? In: ACM MM.","DOI":"10.1145\/3503161.3549201"},{"key":"2388_CR73","unstructured":"Zhou, Z., Alabi, O., Wei, M., et\u00a0al. (2024a). Text promptable surgical instrument segmentation with vision-language models. In: NeurIPS."},{"key":"2388_CR74","doi-asserted-by":"crossref","unstructured":"Zhou, Z., Zhu, Z., Caesar, H., et\u00a0al. (2024b). Openpsg: Open-set panoptic scene graph generation via large multimodal models. In: ECCV.","DOI":"10.1007\/978-3-031-72684-2_12"},{"issue":"9","key":"2388_CR75","doi-asserted-by":"publisher","first-page":"2337","DOI":"10.1007\/s11263-022-01653-1","volume":"130","author":"K Zhou","year":"2022","unstructured":"Zhou, K., Yang, J., Loy, C. C., et al. (2022). Learning to prompt for vision-language models. International Journal of Computer Vision, 130(9), 2337\u20132348.","journal-title":"International Journal of Computer Vision"},{"key":"2388_CR76","unstructured":"Zhu, D., Chen, J., Shen, X., et\u00a0al. (2023). MiniGPT-4: Enhancing vision-language understanding with advanced large language models. 
arXiv:2304.10592"}],"container-title":["International Journal of Computer Vision"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11263-025-02388-5.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s11263-025-02388-5\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11263-025-02388-5.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,7]],"date-time":"2025-06-07T06:00:42Z","timestamp":1749276042000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s11263-025-02388-5"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,2,21]]},"references-count":76,"journal-issue":{"issue":"7","published-print":{"date-parts":[[2025,7]]}},"alternative-id":["2388"],"URL":"https:\/\/doi.org\/10.1007\/s11263-025-02388-5","relation":{},"ISSN":["0920-5691","1573-1405"],"issn-type":[{"value":"0920-5691","type":"print"},{"value":"1573-1405","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025,2,21]]},"assertion":[{"value":"27 August 2024","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"5 February 2025","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"21 February 2025","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors declare that they have no conflict of interest.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}}]}}
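The record above is a standard Crossref "work" payload (status/message-type envelope, then bibliographic fields such as `title`, `author`, `issued`, `container-title`, and the `reference` array). A minimal sketch of how such a record could be fetched and summarised is given below; it assumes network access to the public api.crossref.org REST endpoint and uses only the field names visible in the JSON above, with the DOI taken from this record.

```python
# Minimal sketch: fetch and summarise a Crossref "work" record like the one above.
# Assumes the public Crossref REST API (api.crossref.org) is reachable; field
# names follow the JSON shown here (message.title, message.author, ...).
import json
from urllib.request import urlopen

DOI = "10.1007/s11263-025-02388-5"  # DOI from the record above

with urlopen(f"https://api.crossref.org/works/{DOI}") as resp:
    record = json.load(resp)

work = record["message"]                      # payload when message-type == "work"
title = work["title"][0]                      # titles are stored as a list
authors = ", ".join(
    f'{a.get("given", "")} {a.get("family", "")}'.strip() for a in work["author"]
)
year = work["issued"]["date-parts"][0][0]     # date-parts is [[year, month, day]]
journal = work["container-title"][0]

print(f"{authors} ({year}). {title}. {journal}. DOI: {work['DOI']}")
print(f"{work['references-count']} references; "
      f"cited by {work['is-referenced-by-count']} works.")
```

Run as-is, this would print a one-line citation for the article plus its reference and citation counts; the `reference` list embedded in the record can be walked the same way (each entry carries a `key`, and usually a `DOI` and/or `unstructured` string, as seen above).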