{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,4]],"date-time":"2026-05-04T05:56:40Z","timestamp":1777874200493,"version":"3.51.4"},"reference-count":73,"publisher":"Elsevier BV","license":[{"start":{"date-parts":[[2026,7,1]],"date-time":"2026-07-01T00:00:00Z","timestamp":1782864000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/tdm\/userlicense\/1.0\/"},{"start":{"date-parts":[[2026,7,1]],"date-time":"2026-07-01T00:00:00Z","timestamp":1782864000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/legal\/tdmrep-license"},{"start":{"date-parts":[[2026,7,1]],"date-time":"2026-07-01T00:00:00Z","timestamp":1782864000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-017"},{"start":{"date-parts":[[2026,7,1]],"date-time":"2026-07-01T00:00:00Z","timestamp":1782864000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"},{"start":{"date-parts":[[2026,7,1]],"date-time":"2026-07-01T00:00:00Z","timestamp":1782864000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-012"},{"start":{"date-parts":[[2026,7,1]],"date-time":"2026-07-01T00:00:00Z","timestamp":1782864000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2026,7,1]],"date-time":"2026-07-01T00:00:00Z","timestamp":1782864000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-004"}],"content-domain":{"domain":["elsevier.com","sciencedirect.com"],"crossmark-restriction":true},"short-container-title":["Neurocomputing"],"published-print":{"date-parts":[[2026,7]]},"DOI":"10.1016\/j.neucom.2026.133584","type":"journal-article","created":{"date-parts":[[2026,4,7]],"date-time":"2026-04-07T23:27:41Z","timestamp":1775604461000},"page":"133584","update-policy":"https:\/\/doi.org\/10.1016\/elsevier_cm_policy","source":"Crossref","is-referenced-by-count":0,"special_numbering":"C","title":["GazeCLIP: Enhancing gaze estimation through text-guided multimodal learning"],"prefix":"10.1016","volume":"685","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-2613-2752","authenticated-orcid":false,"given":"Jun","family":"Wang","sequence":"first","affiliation":[]},{"given":"Hao","family":"Ruan","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0009-0000-3493-6403","authenticated-orcid":false,"given":"Liangjian","family":"Wen","sequence":"additional","affiliation":[]},{"given":"Yong","family":"Dai","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7346-1110","authenticated-orcid":false,"given":"Mingjie","family":"Wang","sequence":"additional","affiliation":[]}],"member":"78","reference":[{"key":"10.1016\/j.neucom.2026.133584_bib0005","author":"Bao"},{"key":"10.1016\/j.neucom.2026.133584_bib0010","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","first-page":"3143","article-title":"Appearance-based gaze estimation using attention and difference mechanism","author":"Biswas","year":"2021"},{"key":"10.1016\/j.neucom.2026.133584_bib0015","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","first-page":"22035","article-title":"Source-free adaptive gaze estimation by uncertainty reduction","author":"Cai","year":"2023"},{"key":"10.1016\/j.neucom.2026.133584_bib0020","series-title":"ACM Symposium on Eye Tracking Research and Applications","first-page":"1","article-title":"Deep semantic gaze embedding and scanpath comparison for expertise classification during OPT viewing","author":"Castner","year":"2020"},{"key":"10.1016\/j.neucom.2026.133584_bib0025","series-title":"Asian Conference on Computer Vision","first-page":"309","article-title":"Appearance-based gaze estimation using dilated-convolutions","author":"Chen","year":"2018"},{"key":"10.1016\/j.neucom.2026.133584_bib0030","series-title":"Proceedings of the AAAI Conference on Artificial Intelligence","first-page":"436","article-title":"PureGaze: purifying gaze feature for generalizable gaze estimation","author":"Cheng","year":"2022"},{"key":"10.1016\/j.neucom.2026.133584_bib0035","series-title":"Proceedings of the AAAI Conference on Artificial Intelligence","first-page":"10623","article-title":"A coarse-to-fine adaptive network for appearance-based gaze estimation","author":"Cheng","year":"2020"},{"key":"10.1016\/j.neucom.2026.133584_bib0040","series-title":"2022 26th International Conference on Pattern Recognition (ICPR)","first-page":"3341","article-title":"Gaze estimation using transformer","author":"Cheng","year":"2022"},{"key":"10.1016\/j.neucom.2026.133584_bib0045","series-title":"Proceedings of the IEEE\/CVF International Conference on Computer Vision","first-page":"20632","article-title":"DVGaze: dual-view gaze estimation","author":"Cheng","year":"2023"},{"key":"10.1016\/j.neucom.2026.133584_bib0050","series-title":"Proceedings of the European Conference on Computer Vision (ECCV)","first-page":"100","article-title":"Appearance-based gaze estimation via evaluation-guided asymmetric regression","author":"Cheng","year":"2018"},{"key":"10.1016\/j.neucom.2026.133584_bib0055","doi-asserted-by":"crossref","first-page":"7509","DOI":"10.1109\/TPAMI.2024.3393571","article-title":"Appearance-based gaze estimation with deep learning: a review and benchmark","volume":"46","author":"Cheng","year":"2024","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"10.1016\/j.neucom.2026.133584_bib0060","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","first-page":"23891","article-title":"3D prior is all you need: cross-task few-shot 2D gaze estimation","author":"Cheng","year":"2025"},{"key":"10.1016\/j.neucom.2026.133584_bib0065","doi-asserted-by":"crossref","first-page":"5259","DOI":"10.1109\/TIP.2020.2982828","article-title":"Gaze estimation by exploring two-eye asymmetry","volume":"29","author":"Cheng","year":"2020","journal-title":"IEEE Trans. Image Process."},{"key":"10.1016\/j.neucom.2026.133584_bib0070","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","first-page":"1556","article-title":"What do you see in vehicle? Comprehensive vision solution for in-vehicle gaze estimation","author":"Cheng","year":"2024"},{"key":"10.1016\/j.neucom.2026.133584_bib0075","author":"Devlin"},{"key":"10.1016\/j.neucom.2026.133584_bib0080","author":"Dosovitskiy"},{"key":"10.1016\/j.neucom.2026.133584_bib0085","series-title":"Proceedings of the European Conference on Computer Vision (ECCV)","first-page":"334","article-title":"RT-GENE: real-time eye gaze estimation in natural environments","author":"Fischer","year":"2018"},{"key":"10.1016\/j.neucom.2026.133584_bib0090","series-title":"Proceedings of the Symposium on Eye Tracking Research and Applications","first-page":"255","article-title":"Eyediap: a database for the development and evaluation of gaze estimation algorithms from RGB and RGB-d cameras","author":"Funes Mora","year":"2014"},{"key":"10.1016\/j.neucom.2026.133584_bib0095","series-title":"Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision","first-page":"3223","article-title":"MTGLS: multi-task gaze estimation with limited supervision","author":"Ghosh","year":"2022"},{"key":"10.1016\/j.neucom.2026.133584_bib0100","doi-asserted-by":"crossref","first-page":"1124","DOI":"10.1109\/TBME.2005.863952","article-title":"General theory of remote gaze estimation using the pupil center and corneal reflections","volume":"53","author":"Guestrin","year":"2006","journal-title":"IEEE Trans. Biomed. Eng."},{"key":"10.1016\/j.neucom.2026.133584_bib0105","series-title":"Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition","first-page":"770","article-title":"Deep residual learning for image recognition","author":"He","year":"2016"},{"key":"10.1016\/j.neucom.2026.133584_bib0110","series-title":"International Conference on Image Analysis and Recognition","first-page":"368","article-title":"Slam-based multistate tracking system for mobile human-robot interaction","author":"Hempel","year":"2020"},{"key":"10.1016\/j.neucom.2026.133584_bib0115","author":"Hong"},{"key":"10.1016\/j.neucom.2026.133584_bib0120","series-title":"Proceedings of the IEEE\/CVF International Conference on Computer Vision","first-page":"6912","article-title":"Gaze360: physically unconstrained gaze estimation in the wild","author":"Kellnhofer","year":"2019"},{"key":"10.1016\/j.neucom.2026.133584_bib0125","series-title":"International Conference on Machine Learning","first-page":"5583","article-title":"VILT: vision-and-language transformer without convolution or region supervision","author":"Kim","year":"2021"},{"key":"10.1016\/j.neucom.2026.133584_bib0130","author":"Kingma"},{"key":"10.1016\/j.neucom.2026.133584_bib0135","series-title":"Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition","first-page":"2176","article-title":"Eye tracking for everyone","author":"Krafka","year":"2016"},{"key":"10.1016\/j.neucom.2026.133584_bib0140","article-title":"Imagenet classification with deep convolutional neural networks","volume":"25","author":"Krizhevsky","year":"2012","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"10.1016\/j.neucom.2026.133584_bib0145","doi-asserted-by":"crossref","first-page":"2278","DOI":"10.1109\/5.726791","article-title":"Gradient-based learning applied to document recognition","volume":"86","author":"LeCun","year":"1998","journal-title":"Proc. IEEE"},{"key":"10.1016\/j.neucom.2026.133584_bib0150","series-title":"International Conference on Machine Learning","first-page":"12888","article-title":"BLIP: bootstrapping language-image pre-training for unified vision-language understanding and generation","author":"Li","year":"2022"},{"key":"10.1016\/j.neucom.2026.133584_bib0155","first-page":"9694","article-title":"Align before fuse: vision and language representation learning with momentum distillation","volume":"34","author":"Li","year":"2021","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"10.1016\/j.neucom.2026.133584_bib0160","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","first-page":"10965","article-title":"Grounded language-image pre-training","author":"Li","year":"2022"},{"key":"10.1016\/j.neucom.2026.133584_bib0165","first-page":"35313","article-title":"Ordinalclip: learning rank prompts for language-guided ordinal regression","volume":"35","author":"Li","year":"2022","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"10.1016\/j.neucom.2026.133584_bib0170","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","first-page":"2893","article-title":"Crowdclip: unsupervised crowd counting via vision-language model","author":"Liang","year":"2023"},{"key":"10.1016\/j.neucom.2026.133584_bib0175","series-title":"Proceedings of the AAAI Conference on Artificial Intelligence","first-page":"3621","article-title":"Test-time personalization with meta prompt for gaze estimation","author":"Liu","year":"2024"},{"key":"10.1016\/j.neucom.2026.133584_bib0180","series-title":"Proceedings of the AAAI Conference on Artificial Intelligence","first-page":"3693","article-title":"UVAGaze: unsupervised 1-to-2 views adaptation for gaze estimation","author":"Liu","year":"2024"},{"key":"10.1016\/j.neucom.2026.133584_bib0185","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","first-page":"3794","article-title":"Goal-oriented gaze estimation for zero-shot learning","author":"Liu","year":"2021"},{"issue":"8","key":"10.1016\/j.neucom.2026.133584_bib0190","doi-asserted-by":"crossref","first-page":"4232","DOI":"10.1109\/TAI.2024.3366174","article-title":"Multistream gaze estimation with anatomical eye region isolation by synthetic to real transfer learning","volume":"5","author":"Mahmud","year":"2024","journal-title":"IEEE Trans. Artif. Intell."},{"key":"10.1016\/j.neucom.2026.133584_bib0195","series-title":"Proceedings of the IEEE\/CVF International Conference on Computer Vision","first-page":"9368","article-title":"Few-shot adaptive gaze estimation","author":"Park","year":"2019"},{"key":"10.1016\/j.neucom.2026.133584_bib0200","series-title":"Proceedings of the European Conference on Computer Vision (ECCV)","first-page":"721","article-title":"Deep pictorial gaze estimation","author":"Park","year":"2018"},{"key":"10.1016\/j.neucom.2026.133584_bib0205","series-title":"International Conference on Machine Learning","first-page":"8748","article-title":"Learning transferable visual models from natural language supervision","author":"Radford","year":"2021"},{"key":"10.1016\/j.neucom.2026.133584_bib0210","unstructured":"A. Radford, K. Narasimhan, T. Salimans, I. Sutskever, et al., Improving language understanding by generative pre-training, 2018."},{"key":"10.1016\/j.neucom.2026.133584_bib0215","first-page":"9","article-title":"Language models are unsupervised multitask learners","volume":"1","author":"Radford","year":"2019","journal-title":"OpenAI blog"},{"key":"10.1016\/j.neucom.2026.133584_bib0220","first-page":"1","article-title":"Exploring the limits of transfer learning with a unified text-to-text transformer","volume":"21","author":"Raffel","year":"2020","journal-title":"J. Mach. Learn. Res."},{"key":"10.1016\/j.neucom.2026.133584_bib0225","author":"Ramesh"},{"key":"10.1016\/j.neucom.2026.133584_bib0230","series-title":"International Conference on Machine Learning","first-page":"8821","article-title":"Zero-shot text-to-image generation","author":"Ramesh","year":"2021"},{"key":"10.1016\/j.neucom.2026.133584_bib0235","series-title":"Proceedings of the Augmented Humans International Conference 2025","first-page":"302","article-title":"GazeLLM: multimodal llms incorporating human visual attention","author":"Rekimoto","year":"2025"},{"key":"10.1016\/j.neucom.2026.133584_bib0240","author":"Shen"},{"key":"10.1016\/j.neucom.2026.133584_bib0245","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","first-page":"9611","article-title":"Proposalclip: unsupervised open-category object proposal generation via exploiting clip cues","author":"Shi","year":"2022"},{"key":"10.1016\/j.neucom.2026.133584_bib0250","author":"Simonyan"},{"key":"10.1016\/j.neucom.2026.133584_bib0255","doi-asserted-by":"crossref","first-page":"923","DOI":"10.3390\/s22030923","article-title":"Robot system assistant (rosa): towards intuitive multi-modal and multi-device human-robot interaction","volume":"22","author":"Strazdas","year":"2022","journal-title":"Sensors"},{"key":"10.1016\/j.neucom.2026.133584_bib0260","article-title":"Attention is all you need","volume":"30","author":"Vaswani","year":"2017","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"10.1016\/j.neucom.2026.133584_bib0265","series-title":"Proceedings of the 32nd ACM International Conference on Multimedia","first-page":"7143","article-title":"TextGaze: gaze-controllable face generation with natural language","author":"Wang","year":"2024"},{"key":"10.1016\/j.neucom.2026.133584_bib0270","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","first-page":"11907","article-title":"Generalizing eye tracking with Bayesian adversarial learning","author":"Wang","year":"2019"},{"key":"10.1016\/j.neucom.2026.133584_bib0275","doi-asserted-by":"crossref","first-page":"2368","DOI":"10.1109\/TIP.2017.2787612","article-title":"Deep visual attention prediction","volume":"27","author":"Wang","year":"2017","journal-title":"IEEE Trans. Image Process."},{"key":"10.1016\/j.neucom.2026.133584_bib0280","author":"Wang"},{"key":"10.1016\/j.neucom.2026.133584_bib0285","series-title":"European Conference on Computer Vision","first-page":"736","article-title":"A simple baseline for open-vocabulary semantic segmentation with pre-trained vision-language model","author":"Xu","year":"2022"},{"key":"10.1016\/j.neucom.2026.133584_bib0290","series-title":"Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition","first-page":"5333","article-title":"Gaze prediction in dynamic 360 immersive videos","author":"Xu","year":"2018"},{"key":"10.1016\/j.neucom.2026.133584_bib0295","series-title":"ICASSP 2024-2024 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","first-page":"2760","article-title":"NERF-Gaze: a head-eye redirection parametric model for gaze estimation","author":"Yin","year":"2024"},{"key":"10.1016\/j.neucom.2026.133584_bib0300","series-title":"European Conference on Computer Vision","first-page":"1","article-title":"LG-Gaze: learning geometry-aware continuous prompts for language-guided gaze estimation","author":"Yin","year":"2024"},{"key":"10.1016\/j.neucom.2026.133584_bib0305","series-title":"Proceedings of the AAAI Conference on Artificial Intelligence","first-page":"6729","article-title":"Clip-Gaze: towards general gaze estimation via visual-linguistic model","author":"Yin","year":"2024"},{"key":"10.1016\/j.neucom.2026.133584_bib0310","doi-asserted-by":"crossref","first-page":"93448","DOI":"10.1109\/ACCESS.2019.2928339","article-title":"Driver gaze detection based on deep residual networks using the combined single image of dual near-infrared cameras","volume":"7","author":"Yoon","year":"2019","journal-title":"IEEE Access"},{"key":"10.1016\/j.neucom.2026.133584_bib0315","author":"Yu"},{"key":"10.1016\/j.neucom.2026.133584_bib0320","author":"Yu"},{"key":"10.1016\/j.neucom.2026.133584_bib0325","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","first-page":"6978","article-title":"Turning a clip model into a scene text detector","author":"Yu","year":"2023"},{"key":"10.1016\/j.neucom.2026.133584_bib0330","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","first-page":"8552","article-title":"Pointclip: point cloud understanding by clip","author":"Zhang","year":"2022"},{"key":"10.1016\/j.neucom.2026.133584_bib0335","series-title":"Proceedings of the 30th ACM International Conference on Multimedia","first-page":"6868","article-title":"Can language understand depth?","author":"Zhang","year":"2022"},{"key":"10.1016\/j.neucom.2026.133584_bib0340","series-title":"Computer Vision\u2013ECCV 2020: 16th European Conference, Glasgow, UK, August 23\u201328, 2020, Proceedings, Part v 16","first-page":"365","article-title":"ETH-XGaze: a large scale dataset for gaze estimation under extreme head pose and gaze variation","author":"Zhang","year":"2020"},{"key":"10.1016\/j.neucom.2026.133584_bib0345","series-title":"Proceedings of the 2018 ACM Symposium on Eye Tracking Research & Applications","first-page":"1","article-title":"Revisiting data normalization for appearance-based gaze estimation","author":"Zhang","year":"2018"},{"key":"10.1016\/j.neucom.2026.133584_bib0350","series-title":"Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition","first-page":"4511","article-title":"Appearance-based gaze estimation in the wild","author":"Zhang","year":"2015"},{"key":"10.1016\/j.neucom.2026.133584_bib0355","series-title":"Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition Workshops","first-page":"51","article-title":"It\u2019s written all over your face: full-face appearance-based gaze estimation","author":"Zhang","year":"2017"},{"key":"10.1016\/j.neucom.2026.133584_bib0360","doi-asserted-by":"crossref","first-page":"162","DOI":"10.1109\/TPAMI.2017.2778103","article-title":"MPIIGaze: real-world dataset and deep appearance-based gaze estimation","volume":"41","author":"Zhang","year":"2017","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"10.1016\/j.neucom.2026.133584_bib0365","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","first-page":"16816","article-title":"Conditional prompt learning for vision-language models","author":"Zhou","year":"2022"}],"container-title":["Neurocomputing"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0925231226009811?httpAccept=text\/xml","content-type":"text\/xml","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0925231226009811?httpAccept=text\/plain","content-type":"text\/plain","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2026,4,30]],"date-time":"2026-04-30T19:18:00Z","timestamp":1777576680000},"score":1,"resource":{"primary":{"URL":"https:\/\/linkinghub.elsevier.com\/retrieve\/pii\/S0925231226009811"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,7]]},"references-count":73,"alternative-id":["S0925231226009811"],"URL":"https:\/\/doi.org\/10.1016\/j.neucom.2026.133584","relation":{},"ISSN":["0925-2312"],"issn-type":[{"value":"0925-2312","type":"print"}],"subject":[],"published":{"date-parts":[[2026,7]]},"assertion":[{"value":"Elsevier","name":"publisher","label":"This article is maintained by"},{"value":"GazeCLIP: Enhancing gaze estimation through text-guided multimodal learning","name":"articletitle","label":"Article Title"},{"value":"Neurocomputing","name":"journaltitle","label":"Journal Title"},{"value":"https:\/\/doi.org\/10.1016\/j.neucom.2026.133584","name":"articlelink","label":"CrossRef DOI link to publisher maintained version"},{"value":"article","name":"content_type","label":"Content Type"},{"value":"\u00a9 2026 Published by Elsevier B.V.","name":"copyright","label":"Copyright"}],"article-number":"133584"}}