{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,14]],"date-time":"2026-05-14T23:09:09Z","timestamp":1778800149244,"version":"3.51.4"},"reference-count":76,"publisher":"Elsevier BV","license":[{"start":{"date-parts":[[2026,5,1]],"date-time":"2026-05-01T00:00:00Z","timestamp":1777593600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/tdm\/userlicense\/1.0\/"},{"start":{"date-parts":[[2026,5,1]],"date-time":"2026-05-01T00:00:00Z","timestamp":1777593600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/legal\/tdmrep-license"},{"start":{"date-parts":[[2026,5,1]],"date-time":"2026-05-01T00:00:00Z","timestamp":1777593600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-017"},{"start":{"date-parts":[[2026,5,1]],"date-time":"2026-05-01T00:00:00Z","timestamp":1777593600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"},{"start":{"date-parts":[[2026,5,1]],"date-time":"2026-05-01T00:00:00Z","timestamp":1777593600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-012"},{"start":{"date-parts":[[2026,5,1]],"date-time":"2026-05-01T00:00:00Z","timestamp":1777593600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2026,5,1]],"date-time":"2026-05-01T00:00:00Z","timestamp":1777593600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-004"}],"content-domain":{"domain":["elsevier.com","sciencedirect.com"],"crossmark-restriction":true},"short-container-title":["Computer Vision and Image Understanding"],"published-print":{"date-parts":[[2026,5]]},"DOI":"10.1016\/j.cviu.2026.104740","type":"journal-article","created":{"date-parts":[[2026,4,1]],"date-time":"2026-04-01T07:02:01Z","timestamp":1775026921000},"page":"104740","update-policy":"https:\/\/doi.org\/10.1016\/elsevier_cm_policy","source":"Crossref","is-referenced-by-count":0,"special_numbering":"C","title":["Learning long- and short-term dynamics for human attention prediction using large video models"],"prefix":"10.1016","volume":"268","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-6825-5171","authenticated-orcid":false,"given":"Morteza","family":"Moradi","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0006-1009","authenticated-orcid":false,"given":"Mohammad","family":"Moradi","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8198-0335","authenticated-orcid":false,"given":"Ali","family":"Borji","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6122-4249","authenticated-orcid":false,"given":"Federica","family":"Proietto Salanitri","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Giovanni","family":"Bellitto","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Francesco","family":"Rundo","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2441-0982","authenticated-orcid":false,"given":"Simone","family":"Palazzo","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6653-2577","authenticated-orcid":false,"given":"Concetto","family":"Spampinato","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"78","reference":[{"issue":"3","key":"10.1016\/j.cviu.2026.104740_b1","doi-asserted-by":"crossref","first-page":"709","DOI":"10.1016\/j.neuron.2017.06.041","article-title":"Discovering event structure in continuous narrative perception and memory","volume":"95","author":"Baldassano","year":"2017","journal-title":"Neuron"},{"key":"10.1016\/j.cviu.2026.104740_b2","doi-asserted-by":"crossref","first-page":"3216","DOI":"10.1007\/s11263-021-01519-y","article-title":"Hierarchical domain-adapted feature learning for video saliency prediction","volume":"129","author":"Bellitto","year":"2021","journal-title":"Int. J. Comput. Vis."},{"key":"10.1016\/j.cviu.2026.104740_b3","first-page":"4","article-title":"Is space-time attention all you need for video understanding?","volume":"vol. 2","author":"Bertasius","year":"2021"},{"issue":"3","key":"10.1016\/j.cviu.2026.104740_b4","doi-asserted-by":"crossref","first-page":"740","DOI":"10.1109\/TPAMI.2018.2815601","article-title":"What do different evaluation metrics tell us about saliency models?","volume":"41","author":"Bylinskii","year":"2018","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"10.1016\/j.cviu.2026.104740_b5","series-title":"Temporal-spatial feature pyramid for video saliency detection","author":"Chang","year":"2021"},{"key":"10.1016\/j.cviu.2026.104740_b6","article-title":"Drosophila-vision-inspired motion perception model and its application in saliency detection","author":"Chen","year":"2024","journal-title":"IEEE Trans. Consum. Electron."},{"key":"10.1016\/j.cviu.2026.104740_b7","series-title":"2022 International Conference on Smart Technologies and Systems for Next Generation Computing","first-page":"1","article-title":"Automatic video summarization for cricket match highlights using convolutional neural network","author":"Dange","year":"2022"},{"key":"10.1016\/j.cviu.2026.104740_b8","doi-asserted-by":"crossref","unstructured":"Droste,\u00a0R., Jiao,\u00a0J., Noble,\u00a0J.A., Unified Image and Video Saliency Modeling. In: Vedaldi,\u00a0A., Bischof,\u00a0H., Brox,\u00a0T., Frahm,\u00a0J. (Eds.), ECCV 2020.","DOI":"10.1007\/978-3-030-58558-7_25"},{"key":"10.1016\/j.cviu.2026.104740_b9","series-title":"European Conference on Computer Vision","first-page":"419","article-title":"Unified image and video saliency modeling","author":"Droste","year":"2020"},{"key":"10.1016\/j.cviu.2026.104740_b10","series-title":"Violet: End-to-end video-language transformers with masked visual-token modeling","author":"Fu","year":"2021"},{"issue":"6","key":"10.1016\/j.cviu.2026.104740_b11","doi-asserted-by":"crossref","first-page":"8268","DOI":"10.1007\/s11227-021-04151-2","article-title":"Driver attention prediction based on convolution and transformers","volume":"78","author":"Gou","year":"2022","journal-title":"J. Supercomput."},{"issue":"1","key":"10.1016\/j.cviu.2026.104740_b12","doi-asserted-by":"crossref","first-page":"87","DOI":"10.1109\/TPAMI.2022.3152247","article-title":"A survey on vision transformer","volume":"45","author":"Han","year":"2022","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"issue":"10","key":"10.1016\/j.cviu.2026.104740_b13","doi-asserted-by":"crossref","first-page":"2539","DOI":"10.1523\/JNEUROSCI.5487-07.2008","article-title":"A hierarchy of temporal receptive windows in human cortex","volume":"28","author":"Hasson","year":"2008","journal-title":"J. Neurosci."},{"issue":"2","key":"10.1016\/j.cviu.2026.104740_b14","doi-asserted-by":"crossref","first-page":"423","DOI":"10.1016\/j.neuron.2012.08.011","article-title":"Slow cortical dynamics and the accumulation of information over long timescales","volume":"76","author":"Honey","year":"2012","journal-title":"Neuron"},{"key":"10.1016\/j.cviu.2026.104740_b15","article-title":"Context-aware driver attention estimation using multi-hierarchy saliency fusion with gaze tracking","author":"Hu","year":"2024","journal-title":"IEEE Trans. Intell. Transp. Syst."},{"key":"10.1016\/j.cviu.2026.104740_b16","doi-asserted-by":"crossref","unstructured":"Jain,\u00a0S., Yarlagadda,\u00a0P., Jyoti,\u00a0S., Karthik,\u00a0S., Subramanian,\u00a0R., Gandhi,\u00a0V., ViNet: Pushing the limits of Visual Modality for Audio-Visual Saliency Prediction. In: IROS 2021.","DOI":"10.1109\/IROS51168.2021.9635989"},{"key":"10.1016\/j.cviu.2026.104740_b17","series-title":"2021 IEEE\/RSJ International Conference on Intelligent Robots and Systems","first-page":"8444","article-title":"Robust top-down and bottom-up visual saliency for mobile robots using bio-inspired design principles","author":"Jaramillo-Avila","year":"2021"},{"key":"10.1016\/j.cviu.2026.104740_b18","doi-asserted-by":"crossref","unstructured":"Jiang,\u00a0M., Huang,\u00a0S., Duan,\u00a0J., Zhao,\u00a0Q., 2015. Salicon: Saliency in context. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition. pp. 1072\u20131080.","DOI":"10.1109\/CVPR.2015.7298710"},{"key":"10.1016\/j.cviu.2026.104740_b19","series-title":"Predicting video saliency with object-to-motion CNN and two-layer convolutional LSTM","author":"Jiang","year":"2017"},{"issue":"1","key":"10.1016\/j.cviu.2026.104740_b20","doi-asserted-by":"crossref","first-page":"1792","DOI":"10.1109\/TIV.2023.3275543","article-title":"An attention-guided multistream feature fusion network for early localization of risky traffic agents in driving videos","volume":"9","author":"Karim","year":"2023","journal-title":"IEEE Trans. Intell. Veh."},{"key":"10.1016\/j.cviu.2026.104740_b21","series-title":"The kinetics human action video dataset","author":"Kay","year":"2017"},{"key":"10.1016\/j.cviu.2026.104740_b22","unstructured":"Kingma,\u00a0D.P., Ba,\u00a0J., 2015. Adam: A Method for Stochastic Optimization. In: ICLR 2015."},{"issue":"1","key":"10.1016\/j.cviu.2026.104740_b23","doi-asserted-by":"crossref","DOI":"10.1080\/08839514.2022.2094408","article-title":"Saliency detection using a bio-inspired spiking neural network driven by local and global saliency","volume":"36","author":"Lad","year":"2022","journal-title":"Appl. Artif. Intell."},{"key":"10.1016\/j.cviu.2026.104740_b24","doi-asserted-by":"crossref","first-page":"1113","DOI":"10.1109\/TIP.2019.2936112","article-title":"Video saliency prediction using spatiotemporal residual attentive networks","volume":"29","author":"Lai","year":"2019","journal-title":"IEEE Trans. Image Process."},{"issue":"4","key":"10.1016\/j.cviu.2026.104740_b25","doi-asserted-by":"crossref","first-page":"47","DOI":"10.1007\/s00138-023-01405-2","article-title":"Saliency prediction based on multi-channel models of visual processing","volume":"34","author":"Li","year":"2023","journal-title":"Mach. Vis. Appl."},{"key":"10.1016\/j.cviu.2026.104740_b26","doi-asserted-by":"crossref","unstructured":"Li,\u00a0L., Gan,\u00a0Z., Lin,\u00a0K., Lin,\u00a0C.-C., Liu,\u00a0Z., Liu,\u00a0C., Wang,\u00a0L., 2023. Lavender: Unifying video-language understanding as masked language modeling. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. pp. 23119\u201323129.","DOI":"10.1109\/CVPR52729.2023.02214"},{"key":"10.1016\/j.cviu.2026.104740_b27","series-title":"IEEE\/CVF International Conference on Computer Vision","first-page":"19891","article-title":"Unmasked teacher: Towards training-efficient video foundation models","author":"Li","year":"2023"},{"key":"10.1016\/j.cviu.2026.104740_b28","series-title":"European Conference on Computer Vision","first-page":"388","article-title":"Frozen clip models are efficient video learners","author":"Lin","year":"2022"},{"key":"10.1016\/j.cviu.2026.104740_b29","series-title":"30th British Machine Vision Conference 2019","first-page":"182","article-title":"Simple vs complex temporal recurrences for video saliency prediction","author":"Linardos","year":"2019"},{"key":"10.1016\/j.cviu.2026.104740_b30","series-title":"2024 IEEE Intelligent Vehicles Symposium","first-page":"573","article-title":"Mstf: Multiscale transformer for incomplete trajectory prediction","author":"Liu","year":"2024"},{"key":"10.1016\/j.cviu.2026.104740_b31","doi-asserted-by":"crossref","unstructured":"Liu,\u00a0Z., Ning,\u00a0J., Cao,\u00a0Y., Wei,\u00a0Y., Zhang,\u00a0Z., Lin,\u00a0S., Hu,\u00a0H., 2022. Video swin transformer. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. pp. 3202\u20133211.","DOI":"10.1109\/CVPR52688.2022.00320"},{"key":"10.1016\/j.cviu.2026.104740_b32","doi-asserted-by":"crossref","first-page":"455","DOI":"10.1016\/j.neucom.2022.04.080","article-title":"TranSalNet: Towards perceptually relevant visual saliency prediction","volume":"494","author":"Lou","year":"2022","journal-title":"Neurocomputing"},{"issue":"10","key":"10.1016\/j.cviu.2026.104740_b33","doi-asserted-by":"crossref","first-page":"6850","DOI":"10.1109\/TCSVT.2022.3172971","article-title":"Video saliency forecasting transformer","volume":"32","author":"Ma","year":"2022","journal-title":"IEEE Trans. Circuits Syst. Video Technol."},{"key":"10.1016\/j.cviu.2026.104740_b34","doi-asserted-by":"crossref","DOI":"10.1016\/j.eswa.2024.123623","article-title":"A bio-inspired exogenous attention-based architecture for social robots","volume":"249","author":"Marques-Villarroya","year":"2024","journal-title":"Expert Syst. Appl."},{"issue":"5","key":"10.1016\/j.cviu.2026.104740_b35","doi-asserted-by":"crossref","first-page":"548","DOI":"10.1016\/j.cviu.2009.12.007","article-title":"A biologically-inspired vision architecture for resource-constrained intelligent vehicles","volume":"114","author":"Michalke","year":"2010","journal-title":"Comput. Vis. Image Underst."},{"key":"10.1016\/j.cviu.2026.104740_b36","doi-asserted-by":"crossref","unstructured":"Min,\u00a0K., Corso,\u00a0J.J., 2019. Tased-net: Temporally-aggregating spatial encoder-decoder network for video saliency detection. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision. pp. 2394\u20132403.","DOI":"10.1109\/ICCV.2019.00248"},{"issue":"1","key":"10.1016\/j.cviu.2026.104740_b37","first-page":"1","article-title":"Fixation prediction through multimodal analysis","volume":"13","author":"Min","year":"2016","journal-title":"ACM Trans. Multimed. Comput. Commun. Appl. (TOMM)"},{"key":"10.1016\/j.cviu.2026.104740_b38","series-title":"ICASSP 2025-2025 IEEE International Conference on Acoustics, Speech and Signal Processing","first-page":"1","article-title":"Distilling knowledge from large video models for driver visual attention prediction","author":"Moradi","year":"2025"},{"key":"10.1016\/j.cviu.2026.104740_b39","first-page":"1","article-title":"Recent advancements in driver\u2019s attention prediction","author":"Moradi","year":"2024","journal-title":"Multimedia Tools Appl."},{"key":"10.1016\/j.cviu.2026.104740_b40","series-title":"Proceedings of the 19th International Joint Conference on Computer Vision, Imaging and Computer Graphics Theory and Applications","first-page":"616","article-title":"Transformer-based video saliency prediction with high temporal dimension decoding","author":"Moradi","year":"2024"},{"key":"10.1016\/j.cviu.2026.104740_b41","series-title":"European Conference on Computer Vision","first-page":"178","article-title":"AIM 2024 challenge on video saliency prediction: Methods and results","author":"Moskalenko","year":"2025"},{"issue":"7","key":"10.1016\/j.cviu.2026.104740_b42","doi-asserted-by":"crossref","first-page":"1720","DOI":"10.1109\/TPAMI.2018.2845370","article-title":"Predicting the driver\u2019s focus of attention: the DR (eye) VE project","volume":"41","author":"Palazzi","year":"2018","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"10.1016\/j.cviu.2026.104740_b43","series-title":"Salgan: Visual saliency prediction with generative adversarial networks","author":"Pan","year":"2017"},{"key":"10.1016\/j.cviu.2026.104740_b44","doi-asserted-by":"crossref","DOI":"10.1016\/j.asoc.2022.109285","article-title":"Multimodal learning model based on video\u2013audio\u2013chat feature fusion for detecting e-sports highlights","volume":"126","author":"Park","year":"2022","journal-title":"Appl. Soft Comput."},{"key":"10.1016\/j.cviu.2026.104740_b45","series-title":"Towards robust unsupervised attention prediction in autonomous driving","author":"Qi","year":"2025"},{"key":"10.1016\/j.cviu.2026.104740_b46","series-title":"ICASSP 2024 - 2024 IEEE International Conference on Acoustics, Speech and Signal Processing","first-page":"6615","article-title":"Saliency prediction of sports videos: A large-scale database and a self-adaptive approach","author":"Qiao","year":"2024"},{"key":"10.1016\/j.cviu.2026.104740_b47","series-title":"International Conference on Machine Learning","first-page":"8748","article-title":"Learning transferable visual models from natural language supervision","author":"Radford","year":"2021"},{"issue":"ETRA","key":"10.1016\/j.cviu.2026.104740_b48","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1145\/3530887","article-title":"Where and what: Driver attention-based object detection","volume":"6","author":"Rong","year":"2022","journal-title":"Proc. the ACM Human-Computer Interact."},{"key":"10.1016\/j.cviu.2026.104740_b49","doi-asserted-by":"crossref","unstructured":"Tsiami,\u00a0A., Koutras,\u00a0P., Maragos,\u00a0P., 2020. Stavis: Spatio-temporal audiovisual saliency network. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. pp. 4766\u20134776.","DOI":"10.1109\/CVPR42600.2020.00482"},{"key":"10.1016\/j.cviu.2026.104740_b50","doi-asserted-by":"crossref","unstructured":"Vig,\u00a0E., Dorr,\u00a0M., Cox,\u00a0D., 2014. Large-scale optimization of hierarchical features for saliency prediction in natural images. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition. pp. 2798\u20132805.","DOI":"10.1109\/CVPR.2014.358"},{"key":"10.1016\/j.cviu.2026.104740_b51","series-title":"2019 IEEE 21st International Workshop on Multimedia Signal Processing","first-page":"1","article-title":"YouTube UGC dataset for video compression research","author":"Wang","year":"2019"},{"key":"10.1016\/j.cviu.2026.104740_b52","article-title":"Spatio-temporal self-attention network for video saliency prediction","author":"Wang","year":"2021","journal-title":"IEEE Trans. Multimed."},{"key":"10.1016\/j.cviu.2026.104740_b53","doi-asserted-by":"crossref","DOI":"10.1016\/j.imavis.2021.104216","article-title":"Spatiotemporal module for video saliency prediction based on self-attention","volume":"112","author":"Wang","year":"2021","journal-title":"Image Vis. Comput."},{"issue":"1","key":"10.1016\/j.cviu.2026.104740_b54","doi-asserted-by":"crossref","first-page":"220","DOI":"10.1109\/TPAMI.2019.2924417","article-title":"Revisiting video saliency prediction in the deep learning era","volume":"43","author":"Wang","year":"2019","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"issue":"1","key":"10.1016\/j.cviu.2026.104740_b55","doi-asserted-by":"crossref","first-page":"220","DOI":"10.1109\/TPAMI.2019.2924417","article-title":"Revisiting video saliency prediction in the deep learning era","volume":"43","author":"Wang","year":"2019","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"10.1016\/j.cviu.2026.104740_b56","doi-asserted-by":"crossref","unstructured":"Wu,\u00a0X., Wu,\u00a0Z., Zhang,\u00a0J., Ju,\u00a0L., Wang,\u00a0S., SalSAC: A Video Saliency Prediction Model with Shuffled Attentions and Correlation-Based ConvLSTM. In: AAAI 2020.","DOI":"10.1609\/aaai.v34i07.6927"},{"issue":"22","key":"10.1016\/j.cviu.2026.104740_b57","doi-asserted-by":"crossref","first-page":"27865","DOI":"10.1007\/s10489-023-04861-5","article-title":"GFNet: gated fusion network for video saliency prediction","volume":"53","author":"Wu","year":"2023","journal-title":"Appl. Intell."},{"key":"10.1016\/j.cviu.2026.104740_b58","doi-asserted-by":"crossref","DOI":"10.1016\/j.cviu.2024.104105","article-title":"Joint pyramidal perceptual attention and hierarchical consistency constraint for gaze estimation","volume":"248","author":"Xia","year":"2024","journal-title":"Comput. Vis. Image Underst."},{"key":"10.1016\/j.cviu.2026.104740_b59","series-title":"Computer Vision\u2013ACCV 2018: 14th Asian Conference on Computer Vision, Perth, Australia, December 2\u20136, 2018, Revised Selected Papers, Part V 14","first-page":"658","article-title":"Predicting driver attention in critical situations","author":"Xia","year":"2019"},{"key":"10.1016\/j.cviu.2026.104740_b60","doi-asserted-by":"crossref","unstructured":"Xu,\u00a0H., Gao,\u00a0Y., Yu,\u00a0F., Darrell,\u00a0T., 2017. End-to-end learning of driving models from large-scale video datasets. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition. pp. 2174\u20132182.","DOI":"10.1109\/CVPR.2017.376"},{"key":"10.1016\/j.cviu.2026.104740_b61","doi-asserted-by":"crossref","DOI":"10.1016\/j.compeleceng.2024.109104","article-title":"TransConvNet: Perform perceptually relevant driver\u2019s visual attention predictions","volume":"115","author":"Xu","year":"2024","journal-title":"Comput. Electr. Eng."},{"key":"10.1016\/j.cviu.2026.104740_b62","doi-asserted-by":"crossref","first-page":"23","DOI":"10.1016\/j.neucom.2018.09.093","article-title":"A bio-inspired motion sensitive model and its application to estimating human gaze positions under classified driving conditions","volume":"345","author":"Xu","year":"2019","journal-title":"Neurocomputing"},{"key":"10.1016\/j.cviu.2026.104740_b63","series-title":"2021 IEEE International Conference on Image Processing","first-page":"1604","article-title":"Deep audio-visual fusion neural network for saliency estimation","author":"Yao","year":"2021"},{"key":"10.1016\/j.cviu.2026.104740_b64","series-title":"Coca: Contrastive captioners are image-text foundation models","author":"Yu","year":"2022"},{"issue":"7","key":"10.1016\/j.cviu.2026.104740_b65","doi-asserted-by":"crossref","first-page":"3562","DOI":"10.1109\/TCYB.2019.2931735","article-title":"Bio-inspired representation learning for visual attention prediction","volume":"51","author":"Yuan","year":"2019","journal-title":"IEEE Trans. Cybern."},{"issue":"2","key":"10.1016\/j.cviu.2026.104740_b66","doi-asserted-by":"crossref","first-page":"80","DOI":"10.1111\/j.1467-8721.2007.00480.x","article-title":"Event segmentation","volume":"16","author":"Zacks","year":"2007","journal-title":"Curr. Dir. Psychol. Sci."},{"key":"10.1016\/j.cviu.2026.104740_b67","doi-asserted-by":"crossref","DOI":"10.1016\/j.eswa.2024.123568","article-title":"Attention-guided multi-granularity fusion model for video summarization","volume":"249","author":"Zhang","year":"2024","journal-title":"Expert Syst. Appl."},{"key":"10.1016\/j.cviu.2026.104740_b68","article-title":"A self validation network for object-level human attention estimation","volume":"32","author":"Zhang","year":"2019","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"10.1016\/j.cviu.2026.104740_b69","article-title":"Multi-scale spatiotemporal feature fusion network for video saliency prediction","author":"Zhang","year":"2023","journal-title":"IEEE Trans. Multimed."},{"key":"10.1016\/j.cviu.2026.104740_b70","doi-asserted-by":"crossref","DOI":"10.1016\/j.imavis.2023.104744","article-title":"Accurate video saliency prediction via hierarchical fusion and temporal recurrence","volume":"136","author":"Zhang","year":"2023","journal-title":"Image Vis. Comput."},{"key":"10.1016\/j.cviu.2026.104740_b71","doi-asserted-by":"crossref","first-page":"841","DOI":"10.1007\/s12559-014-9266-z","article-title":"A novel biologically inspired visual saliency model","volume":"6","author":"Zhao","year":"2014","journal-title":"Cogn. Comput."},{"issue":"12","key":"10.1016\/j.cviu.2026.104740_b72","doi-asserted-by":"crossref","first-page":"7696","DOI":"10.1109\/TCSVT.2023.3278410","article-title":"Transformer-based multi-scale feature integration network for video saliency prediction","volume":"33","author":"Zhou","year":"2023","journal-title":"IEEE Trans. Circuits Syst. Video Technol."},{"key":"10.1016\/j.cviu.2026.104740_b73","doi-asserted-by":"crossref","DOI":"10.1016\/j.image.2020.115802","article-title":"Human action recognition toward massive-scale sport sceneries based on deep multi-model feature fusion","volume":"84","author":"Zhou","year":"2020","journal-title":"Signal Process., Image Commun."},{"issue":"4","key":"10.1016\/j.cviu.2026.104740_b74","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1145\/3576857","article-title":"A novel lightweight audio-visual saliency model for videos","volume":"19","author":"Zhu","year":"2023","journal-title":"ACM Trans. Multimed. Comput. Commun. Appl."},{"issue":"6","key":"10.1016\/j.cviu.2026.104740_b75","doi-asserted-by":"crossref","first-page":"4059","DOI":"10.1109\/TETCI.2024.3386619","article-title":"From discrete representation to continuous modeling: A novel audio-visual saliency prediction model with implicit neural representations","volume":"8","author":"Zhu","year":"2024","journal-title":"IEEE Trans. Emerg. Top. Comput. Intell."},{"issue":"2","key":"10.1016\/j.cviu.2026.104740_b76","doi-asserted-by":"crossref","first-page":"1756","DOI":"10.1109\/TETCI.2024.3358184","article-title":"MTCAM: a novel weakly-supervised audio-visual saliency prediction model with multi-modal transformer","volume":"8","author":"Zhu","year":"2024","journal-title":"IEEE Trans. Emerg. Top. Comput. Intell."}],"container-title":["Computer Vision and Image Understanding"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S1077314226001074?httpAccept=text\/xml","content-type":"text\/xml","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S1077314226001074?httpAccept=text\/plain","content-type":"text\/plain","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2026,5,14]],"date-time":"2026-05-14T22:32:14Z","timestamp":1778797934000},"score":1,"resource":{"primary":{"URL":"https:\/\/linkinghub.elsevier.com\/retrieve\/pii\/S1077314226001074"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,5]]},"references-count":76,"alternative-id":["S1077314226001074"],"URL":"https:\/\/doi.org\/10.1016\/j.cviu.2026.104740","relation":{},"ISSN":["1077-3142"],"issn-type":[{"value":"1077-3142","type":"print"}],"subject":[],"published":{"date-parts":[[2026,5]]},"assertion":[{"value":"Elsevier","name":"publisher","label":"This article is maintained by"},{"value":"Learning long- and short-term dynamics for human attention prediction using large video models","name":"articletitle","label":"Article Title"},{"value":"Computer Vision and Image Understanding","name":"journaltitle","label":"Journal Title"},{"value":"https:\/\/doi.org\/10.1016\/j.cviu.2026.104740","name":"articlelink","label":"CrossRef DOI link to publisher maintained version"},{"value":"article","name":"content_type","label":"Content Type"},{"value":"\u00a9 2026 Elsevier Inc. All rights are reserved, including those for text and data mining, AI training, and similar technologies.","name":"copyright","label":"Copyright"}],"article-number":"104740"}}