{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,3,26]],"date-time":"2025-03-26T15:20:35Z","timestamp":1743002435830,"version":"3.40.3"},"publisher-location":"Singapore","reference-count":30,"publisher":"Springer Nature Singapore","isbn-type":[{"type":"print","value":"9789819786916"},{"type":"electronic","value":"9789819786923"}],"license":[{"start":{"date-parts":[[2024,11,1]],"date-time":"2024-11-01T00:00:00Z","timestamp":1730419200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,11,1]],"date-time":"2024-11-01T00:00:00Z","timestamp":1730419200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-981-97-8692-3_14","type":"book-chapter","created":{"date-parts":[[2024,10,31]],"date-time":"2024-10-31T14:03:04Z","timestamp":1730383384000},"page":"191-205","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Vision-Language Knowledge Exploration for Video Saliency Prediction"],"prefix":"10.1007","author":[{"given":"Fei","family":"Zhou","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Baitao","family":"Huang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Guoping","family":"Qiu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2024,11,1]]},"reference":[{"key":"14_CR1","doi-asserted-by":"publisher","first-page":"3216","DOI":"10.1007\/s11263-021-01519-y","volume":"129","author":"G Bellitto","year":"2021","unstructured":"Bellitto, G., Proietto Salanitri, F., Palazzo, S., Rundo, F., Giordano, D., Spampinato, C.: Hierarchical domain-adapted feature learning for video saliency prediction. Int. J. Comput. Vision 129, 3216\u20133232 (2021)","journal-title":"Int. J. Comput. Vision"},{"issue":"3","key":"14_CR2","doi-asserted-by":"publisher","first-page":"740","DOI":"10.1109\/TPAMI.2018.2815601","volume":"41","author":"Z Bylinskii","year":"2018","unstructured":"Bylinskii, Z., Judd, T., Oliva, A., Torralba, A., Durand, F.: What do different evaluation metrics tell us about saliency models? IEEE Trans. Pattern Anal. Mach. Intell. 41(3), 740\u2013757 (2018)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"14_CR3","doi-asserted-by":"publisher","first-page":"1882","DOI":"10.1109\/TIP.2023.3251695","volume":"32","author":"Y Cao","year":"2023","unstructured":"Cao, Y., Min, X., Sun, W., Zhai, G.: Attention-guided neural networks for full-reference and no-reference audio-visual quality assessment. IEEE Trans. Image Process. 32, 1882\u20131896 (2023)","journal-title":"IEEE Trans. Image Process."},{"key":"14_CR4","unstructured":"Dosovitskiy, A., Beyer, L., Kolesnikov, A., Weissenborn, D., Zhai, X., Unterthiner, T., Dehghani, M., Minderer, M., Heigold, G., Gelly, S., et\u00a0al.: An image is worth 16 $$\\times $$ 16 words: transformers for image recognition at scale (2020). arXiv:2010.11929"},{"key":"14_CR5","doi-asserted-by":"crossref","unstructured":"Droste, R., Jiao, J., Noble, J.A.: Unified image and video saliency modeling. In: Computer Vision\u2014ECCV 2020: 16th European Conference, Glasgow, UK, Proceedings, Part V 16, pp. 419\u2013435. Springer (2020)","DOI":"10.1007\/978-3-030-58558-7_25"},{"issue":"10","key":"14_CR6","doi-asserted-by":"publisher","first-page":"1192","DOI":"10.1016\/j.visres.2011.03.010","volume":"51","author":"AD Hwang","year":"2011","unstructured":"Hwang, A.D., Wang, H.C., Pomplun, M.: Semantic guidance of eye movements in real-world scenes. Vision. Res. 51(10), 1192\u20131205 (2011)","journal-title":"Vision. Res."},{"key":"14_CR7","doi-asserted-by":"crossref","unstructured":"Jain, S., Yarlagadda, P., Jyoti, S., Karthik, S., Subramanian, R., Gandhi, V.: Vinet: pushing the limits of visual modality for audio-visual saliency prediction. In: 2021 IEEE\/RSJ International Conference on Intelligent Robots and Systems (IROS), pp. 3520\u20133527. IEEE (2021)","DOI":"10.1109\/IROS51168.2021.9635989"},{"key":"14_CR8","doi-asserted-by":"crossref","unstructured":"Jiang, L., Xu, M., Liu, T., Qiao, M., Wang, Z.: Deepvs: A deep learning based video saliency prediction approach. In: Proceedings of the European Conference on Computer Vision (ECCV), pp. 602\u2013617 (2018)","DOI":"10.1007\/978-3-030-01264-9_37"},{"key":"14_CR9","doi-asserted-by":"crossref","unstructured":"Judd, T., Ehinger, K., Durand, F., Torralba, A.: Learning to predict where humans look. In: 2009 IEEE 12th International Conference on Computer Vision, pp. 2106\u20132113. IEEE (2009)","DOI":"10.1109\/ICCV.2009.5459462"},{"key":"14_CR10","doi-asserted-by":"publisher","first-page":"1113","DOI":"10.1109\/TIP.2019.2936112","volume":"29","author":"Q Lai","year":"2019","unstructured":"Lai, Q., Wang, W., Sun, H., Shen, J.: Video saliency prediction using spatiotemporal residual attentive networks. IEEE Trans. Image Process. 29, 1113\u20131126 (2019)","journal-title":"IEEE Trans. Image Process."},{"key":"14_CR11","doi-asserted-by":"crossref","unstructured":"Li, L., Gan, Z., Lin, K., Lin, C.C., Liu, Z., Liu, C., Wang, L.: Lavender: unifying video-language understanding as masked language modeling. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 23119\u201323129 (2023)","DOI":"10.1109\/CVPR52729.2023.02214"},{"key":"14_CR12","doi-asserted-by":"crossref","unstructured":"Lin, K., Li, L., Lin, C.C., Ahmed, F., Gan, Z., Liu, Z., Lu, Y., Wang, L.: Swinbert: end-to-end transformers with sparse attention for video captioning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 17949\u201317958 (2022)","DOI":"10.1109\/CVPR52688.2022.01742"},{"key":"14_CR13","unstructured":"Linardos, P., Mohedano, E., Nieto, J.J., O\u2019Connor, N.E., Giro-i Nieto, X., McGuinness, K.: Simple versus complex temporal recurrences for video saliency prediction (2019). arXiv:1907.01869"},{"issue":"10","key":"14_CR14","doi-asserted-by":"publisher","first-page":"6850","DOI":"10.1109\/TCSVT.2022.3172971","volume":"32","author":"C Ma","year":"2022","unstructured":"Ma, C., Sun, H., Rao, Y., Zhou, J., Lu, J.: Video saliency forecasting transformer. IEEE Trans. Circuits Syst. Video Technol. 32(10), 6850\u20136862 (2022)","journal-title":"IEEE Trans. Circuits Syst. Video Technol."},{"issue":"7","key":"14_CR15","doi-asserted-by":"publisher","first-page":"1408","DOI":"10.1109\/TPAMI.2014.2366154","volume":"37","author":"S Mathe","year":"2014","unstructured":"Mathe, S., Sminchisescu, C.: Actions in the eye: dynamic gaze datasets and learnt saliency models for visual recognition. IEEE Trans. Pattern Anal. Mach. Intell. 37(7), 1408\u20131424 (2014)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"14_CR16","doi-asserted-by":"publisher","first-page":"5","DOI":"10.1007\/s12559-010-9074-z","volume":"3","author":"PK Mital","year":"2011","unstructured":"Mital, P.K., Smith, T.J., Hill, R.L., Henderson, J.M.: Clustering of gaze during dynamic scene viewing is predicted by motion. Cogn. Comput. 3, 5\u201324 (2011)","journal-title":"Cogn. Comput."},{"key":"14_CR17","doi-asserted-by":"crossref","unstructured":"Ouyang, S., Wang, H., Xie, S., Niu, Z., Tong, R., Chen, Y.W., Lin, L.: Slvit: scale-wise language-guided vision transformer for referring image segmentation. In: Proceedings of the Thirty-Second International Joint Conference on Artificial Intelligence, IJCAI-23, pp. 1294\u20131302 (2023)","DOI":"10.24963\/ijcai.2023\/144"},{"key":"14_CR18","unstructured":"Radford, A., Kim, J.W., Hallacy, C., Ramesh, A., Goh, G., Agarwal, S., Sastry, G., Askell, A., Mishkin, P., Clark, J., et\u00a0al.: Learning transferable visual models from natural language supervision. In: International Conference on Machine Learning, pp. 8748\u20138763. PMLR (2021)"},{"key":"14_CR19","first-page":"10078","volume":"35","author":"Z Tong","year":"2022","unstructured":"Tong, Z., Song, Y., Wang, J., Wang, L.: Videomae: masked autoencoders are data-efficient learners for self-supervised video pre-training. Adv. Neural. Inf. Process. Syst. 35, 10078\u201310093 (2022)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"14_CR20","unstructured":"Vaswani, A., Shazeer, N., Parmar, N., Uszkoreit, J., Jones, L., Gomez, A.N., Kaiser, \u0141., Polosukhin, I.: Attention is all you need. Adv. Neural Inf. Process. Syst. 30 (2017)"},{"issue":"1","key":"14_CR21","doi-asserted-by":"publisher","first-page":"220","DOI":"10.1109\/TPAMI.2019.2924417","volume":"43","author":"W Wang","year":"2019","unstructured":"Wang, W., Shen, J., Xie, J., Cheng, M.M., Ling, H., Borji, A.: Revisiting video saliency prediction in the deep learning era. IEEE Trans. Pattern Anal. Mach. Intell. 43(1), 220\u2013237 (2019)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"14_CR22","doi-asserted-by":"publisher","first-page":"1161","DOI":"10.1109\/TMM.2021.3139743","volume":"25","author":"Z Wang","year":"2021","unstructured":"Wang, Z., Liu, Z., Li, G., Wang, Y., Zhang, T., Xu, L., Wang, J.: Spatio-temporal self-attention network for video saliency prediction. IEEE Trans. Multimedia 25, 1161\u20131174 (2021)","journal-title":"IEEE Trans. Multimedia"},{"key":"14_CR23","doi-asserted-by":"crossref","unstructured":"Wu, W., Wang, X., Luo, H., Wang, J., Yang, Y., Ouyang, W.: Bidirectional cross-modal knowledge exploration for video recognition with pre-trained vision-language models. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 6620\u20136630 (2023)","DOI":"10.1109\/CVPR52729.2023.00640"},{"key":"14_CR24","doi-asserted-by":"crossref","unstructured":"Wu, X., Wu, Z., Zhang, J., Ju, L., Wang, S.: Salsac: a video saliency prediction model with shuffled attentions and correlation-based CONVLSTM. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol.\u00a034, pp. 12410\u201312417 (2020)","DOI":"10.1609\/aaai.v34i07.6927"},{"issue":"3","key":"14_CR25","doi-asserted-by":"publisher","first-page":"576","DOI":"10.1109\/TPAMI.2016.2547384","volume":"39","author":"J Yang","year":"2016","unstructured":"Yang, J., Yang, M.H.: Top-down visual saliency via joint CRF and dictionary learning. IEEE Trans. Pattern Anal. Mach. Intell. 39(3), 576\u2013588 (2016)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"14_CR26","doi-asserted-by":"crossref","unstructured":"Yu, Z., Yu, J., Fan, J., Tao, D.: Multi-modal factorized bilinear pooling with co-attention learning for visual question answering. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 1821\u20131830 (2017)","DOI":"10.1109\/ICCV.2017.202"},{"issue":"4","key":"14_CR27","doi-asserted-by":"publisher","first-page":"787","DOI":"10.1037\/a0013118","volume":"115","author":"GJ Zelinsky","year":"2008","unstructured":"Zelinsky, G.J.: A theory of eye movements during target acquisition. Psychol. Rev. 115(4), 787 (2008)","journal-title":"Psychol. Rev."},{"key":"14_CR28","doi-asserted-by":"publisher","first-page":"4183","DOI":"10.1109\/TMM.2023.3321394","volume":"26","author":"Y Zhang","year":"2023","unstructured":"Zhang, Y., Zhang, T., Wu, C., Tao, R.: Multi-scale spatiotemporal feature fusion network for video saliency prediction. IEEE Trans. Multimedia 26, 4183\u20134193 (2023)","journal-title":"IEEE Trans. Multimedia"},{"issue":"12","key":"14_CR29","doi-asserted-by":"publisher","first-page":"7696","DOI":"10.1109\/TCSVT.2023.3278410","volume":"33","author":"X Zhou","year":"2023","unstructured":"Zhou, X., Wu, S., Shi, R., Zheng, B., Wang, S., Yin, H., Zhang, J., Yan, C.: Transformer-based multi-scale feature integration network for video saliency prediction. IEEE Trans. Circuits Syst. Video Technol. 33(12), 7696\u20137707 (2023)","journal-title":"IEEE Trans. Circuits Syst. Video Technol."},{"issue":"10","key":"14_CR30","doi-asserted-by":"publisher","first-page":"7955","DOI":"10.1007\/s00521-022-06895-1","volume":"34","author":"S Zhu","year":"2022","unstructured":"Zhu, S., Chang, Q., Li, Q.: Video saliency aware intelligent HD video compression with the improvement of visual quality and the reduction of coding complexity. Neural Comput. Appl. 34(10), 7955\u20137974 (2022)","journal-title":"Neural Comput. Appl."}],"container-title":["Lecture Notes in Computer Science","Pattern Recognition and Computer Vision"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-981-97-8692-3_14","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,10,31]],"date-time":"2024-10-31T14:25:13Z","timestamp":1730384713000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-981-97-8692-3_14"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,11,1]]},"ISBN":["9789819786916","9789819786923"],"references-count":30,"URL":"https:\/\/doi.org\/10.1007\/978-981-97-8692-3_14","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2024,11,1]]},"assertion":[{"value":"1 November 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"PRCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Chinese Conference on Pattern Recognition and Computer Vision  (PRCV)","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Urumqi","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"China","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18 October 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"20 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"7","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"ccprcv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"http:\/\/2024.prcv.cn\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}