{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,9,10]],"date-time":"2025-09-10T22:49:04Z","timestamp":1757544544670,"version":"3.40.3"},"publisher-location":"Cham","reference-count":36,"publisher":"Springer Nature Switzerland","isbn-type":[{"type":"print","value":"9783031783111"},{"type":"electronic","value":"9783031783128"}],"license":[{"start":{"date-parts":[[2024,12,4]],"date-time":"2024-12-04T00:00:00Z","timestamp":1733270400000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,12,4]],"date-time":"2024-12-04T00:00:00Z","timestamp":1733270400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-78312-8_3","type":"book-chapter","created":{"date-parts":[[2024,12,3]],"date-time":"2024-12-03T12:36:25Z","timestamp":1733229385000},"page":"33-48","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":4,"title":["SalFoM: Dynamic Saliency Prediction with Video Foundation Models"],"prefix":"10.1007","author":[{"given":"Morteza","family":"Moradi","sequence":"first","affiliation":[]},{"given":"Mohammad","family":"Moradi","sequence":"additional","affiliation":[]},{"given":"Francesco","family":"Rundo","sequence":"additional","affiliation":[]},{"given":"Concetto","family":"Spampinato","sequence":"additional","affiliation":[]},{"given":"Ali","family":"Borji","sequence":"additional","affiliation":[]},{"given":"Simone","family":"Palazzo","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,12,4]]},"reference":[{"key":"3_CR1","unstructured":"Awais, M., Naseer, M., Khan, S., Anwer, R.M., Cholakkal, H., Shah, M., Yang, M.H., Khan, F.S.: Foundational models defining a new era in vision: A survey and outlook. arXiv preprint arXiv:2307.13721 (2023)"},{"key":"3_CR2","doi-asserted-by":"publisher","first-page":"3216","DOI":"10.1007\/s11263-021-01519-y","volume":"129","author":"G Bellitto","year":"2021","unstructured":"Bellitto, G., Proietto Salanitri, F., Palazzo, S., Rundo, F., Giordano, D., Spampinato, C.: Hierarchical domain-adapted feature learning for video saliency prediction. Int. J. Comput. Vision 129, 3216\u20133232 (2021)","journal-title":"Int. J. Comput. Vision"},{"key":"3_CR3","unstructured":"Bertasius, G., Wang, H., Torresani, L.: Is space-time attention all you need for video understanding? In: ICML. vol.\u00a02 (2021)"},{"key":"3_CR4","unstructured":"Bommasani, R., Hudson, D.A., Adeli, E., Altman, R., Arora, S., von Arx, S., Bernstein, M.S., Bohg, J., Bosselut, A., Brunskill, E., et\u00a0al.: On the opportunities and risks of foundation models. arXiv preprint arXiv:2108.07258 (2021)"},{"issue":"3","key":"3_CR5","doi-asserted-by":"publisher","first-page":"740","DOI":"10.1109\/TPAMI.2018.2815601","volume":"41","author":"Z Bylinskii","year":"2018","unstructured":"Bylinskii, Z., Judd, T., Oliva, A., Torralba, A., Durand, F.: What do different evaluation metrics tell us about saliency models? IEEE Trans. Pattern Anal. Mach. Intell. 41(3), 740\u2013757 (2018)","journal-title":"IEEE Trans. Pattern Anal. Mach. 
Intell."},{"key":"3_CR6","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"809","DOI":"10.1007\/978-3-319-46454-1_49","volume-title":"Computer Vision \u2013 ECCV 2016","author":"Z Bylinskii","year":"2016","unstructured":"Bylinskii, Z., Recasens, A., Borji, A., Oliva, A., Torralba, A., Durand, F.: Where Should Saliency Models Look Next? In: Leibe, B., Matas, J., Sebe, N., Welling, M. (eds.) ECCV 2016. LNCS, vol. 9909, pp. 809\u2013824. Springer, Cham (2016). https:\/\/doi.org\/10.1007\/978-3-319-46454-1_49"},{"key":"3_CR7","unstructured":"Chang, Q., Zhu, S.: Temporal-spatial feature pyramid for video saliency detection. arXiv preprint arXiv:2105.04213 (2021)"},{"key":"3_CR8","unstructured":"Chang, Q., Zhu, S.: Temporal-Spatial Feature Pyramid for Video Saliency Detection. arXiv e-prints arXiv:2105.04213 (May 2021). 10.48550\/arXiv.2105.04213"},{"key":"3_CR9","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"419","DOI":"10.1007\/978-3-030-58558-7_25","volume-title":"Computer Vision \u2013 ECCV 2020","author":"R Droste","year":"2020","unstructured":"Droste, R., Jiao, J., Noble, J.A.: Unified Image and Video Saliency Modeling. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12350, pp. 419\u2013435. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58558-7_25"},{"key":"3_CR10","doi-asserted-by":"crossref","unstructured":"Jain, S., Yarlagadda, P., Jyoti, S., Karthik, S., Subramanian, R., Gandhi, V.: Vinet: Pushing the limits of visual modality for audio-visual saliency prediction. In: IROS 2021. pp. 3520\u20133527 (2021)","DOI":"10.1109\/IROS51168.2021.9635989"},{"key":"3_CR11","unstructured":"Kay, W., Carreira, J., Simonyan, K., Zhang, B., Hillier, C., Vijayanarasimhan, S., Viola, F., Green, T., Back, T., Natsev, P., Suleyman, M., Zisserman, A.: The kinetics human action video dataset. CoRR abs\/1705.06950 (2017)"},{"key":"3_CR12","unstructured":"Kingma, D.P., Ba, J.: Adam: A method for stochastic optimization. arXiv preprint arXiv:1412.6980 (2014)"},{"key":"3_CR13","doi-asserted-by":"publisher","first-page":"1113","DOI":"10.1109\/TIP.2019.2936112","volume":"29","author":"Q Lai","year":"2019","unstructured":"Lai, Q., Wang, W., Sun, H., Shen, J.: Video saliency prediction using spatiotemporal residual attentive networks. IEEE Trans. Image Process. 29, 1113\u20131126 (2019)","journal-title":"IEEE Trans. Image Process."},{"key":"3_CR14","doi-asserted-by":"crossref","unstructured":"Li, K., Wang, Y., He, Y., Li, Y., Wang, Y., Wang, L., Qiao, Y.: Uniformerv2: Unlocking the potential of image vits for video understanding. In: CVPR 2023. pp. 1632\u20131643 (2023)","DOI":"10.1109\/ICCV51070.2023.00157"},{"key":"3_CR15","doi-asserted-by":"crossref","unstructured":"Li, K., Wang, Y., Li, Y., Wang, Y., He, Y., Wang, L., Qiao, Y.: Unmasked teacher: Towards training-efficient video foundation models. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision (ICCV). pp. 19948\u201319960 (October 2023)","DOI":"10.1109\/ICCV51070.2023.01826"},{"key":"3_CR16","unstructured":"Linardos, P., Mohedano, E., Nieto, J.J., O\u2019Connor, N.E., Giro-i Nieto, X., McGuinness, K.: Simple vs complex temporal recurrences for video saliency prediction. arXiv preprint arXiv:1907.01869 (2019)"},{"key":"3_CR17","doi-asserted-by":"crossref","unstructured":"Liu, Z., Ning, J., Cao, Y., Wei, Y., Zhang, Z., Lin, S., Hu, H.: Video swin transformer. 
In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition. pp. 3202\u20133211 (2022)","DOI":"10.1109\/CVPR52688.2022.00320"},{"issue":"10","key":"3_CR18","doi-asserted-by":"publisher","first-page":"6850","DOI":"10.1109\/TCSVT.2022.3172971","volume":"32","author":"C Ma","year":"2022","unstructured":"Ma, C., Sun, H., Rao, Y., Zhou, J., Lu, J.: Video saliency forecasting transformer. IEEE Trans. Circuits Syst. Video Technol. 32(10), 6850\u20136862 (2022). https:\/\/doi.org\/10.1109\/TCSVT.2022.3172971","journal-title":"IEEE Trans. Circuits Syst. Video Technol."},{"issue":"7","key":"3_CR19","doi-asserted-by":"publisher","first-page":"1408","DOI":"10.1109\/TPAMI.2014.2366154","volume":"37","author":"S Mathe","year":"2014","unstructured":"Mathe, S., Sminchisescu, C.: Actions in the eye: Dynamic gaze datasets and learnt saliency models for visual recognition. IEEE Trans. Pattern Anal. Mach. Intell. 37(7), 1408\u20131424 (2014)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"issue":"7","key":"3_CR20","doi-asserted-by":"publisher","first-page":"1408","DOI":"10.1109\/TPAMI.2014.2366154","volume":"37","author":"S Mathe","year":"2014","unstructured":"Mathe, S., Sminchisescu, C.: Actions in the eye: Dynamic gaze datasets and learnt saliency models for visual recognition. IEEE Trans. Pattern Anal. Mach. Intell. 37(7), 1408\u20131424 (2014)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"3_CR21","doi-asserted-by":"crossref","unstructured":"Min, K., Corso, J.J.: Tased-net: Temporally-aggregating spatial encoder-decoder network for video saliency detection. In: ICCV 2019 (2019)","DOI":"10.1109\/ICCV.2019.00248"},{"key":"3_CR22","doi-asserted-by":"publisher","unstructured":"Moradi., M., Palazzo., S., Spampinato., C.: Transformer-based video saliency prediction with high temporal dimension decoding. In: VISAPP 2024. SCITEPRESS (2024). https:\/\/doi.org\/10.5220\/0012422800003660","DOI":"10.5220\/0012422800003660"},{"key":"3_CR23","unstructured":"Radford, A., Kim, J.W., Hallacy, C., Ramesh, A., Goh, G., Agarwal, S., Sastry, G., Askell, A., Mishkin, P., Clark, J., et\u00a0al.: Learning transferable visual models from natural language supervision. In: ICML 2021. pp. 8748\u20138763 (2021)"},{"key":"3_CR24","unstructured":"Tong, Z., Song, Y., Wang, J., Wang, L.: Videomae: Masked autoencoders are data-efficient learners for self-supervised video pre-training. NeurIPS 2022 (2022)"},{"key":"3_CR25","doi-asserted-by":"crossref","unstructured":"Wang, L., Huang, B., Zhao, Z., Tong, Z., He, Y., Wang, Y., Wang, Y., Qiao, Y.: Videomae v2: Scaling video masked autoencoders with dual masking. In: CVPR 2023. pp. 14549\u201314560 (2023)","DOI":"10.1109\/CVPR52729.2023.01398"},{"issue":"1","key":"3_CR26","doi-asserted-by":"publisher","first-page":"220","DOI":"10.1109\/TPAMI.2019.2924417","volume":"43","author":"W Wang","year":"2019","unstructured":"Wang, W., Shen, J., Xie, J., Cheng, M.M., Ling, H., Borji, A.: Revisiting video saliency prediction in the deep learning era. IEEE Trans. Pattern Anal. Mach. Intell. 43(1), 220\u2013237 (2019)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"issue":"1","key":"3_CR27","doi-asserted-by":"publisher","first-page":"220","DOI":"10.1109\/TPAMI.2019.2924417","volume":"43","author":"W Wang","year":"2019","unstructured":"Wang, W., Shen, J., Xie, J., Cheng, M.M., Ling, H., Borji, A.: Revisiting video saliency prediction in the deep learning era. IEEE Trans. Pattern Anal. Mach. Intell. 
43(1), 220\u2013237 (2019)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"3_CR28","unstructured":"Wang, Y., Li, K., Li, Y., He, Y., Huang, B., Zhao, Z., Zhang, H., Xu, J., Liu, Y., Wang, Z., et\u00a0al.: Internvideo: General video foundation models via generative and discriminative learning. arXiv preprint arXiv:2212.03191 (2022)"},{"key":"3_CR29","unstructured":"Wang, Z., Liu, Z., Li, G., Wang, Y., Zhang, T., Xu, L., Wang, J.: Spatio-temporal self-attention network for video saliency prediction. IEEE Transactions on Multimedia (2021)"},{"key":"3_CR30","unstructured":"Wang, Z., Liu, Z., Li, G., Wang, Y., Zhang, T., Xu, L., Wang, J.: Spatio-temporal self-attention network for video saliency prediction. IEEE Transactions on Multimedia (2021)"},{"key":"3_CR31","doi-asserted-by":"crossref","unstructured":"Wu, X., Wu, Z., Zhang, J., Ju, L., Wang, S.: Salsac: A video saliency prediction model with shuffled attentions and correlation-based convlstm. In: AAAI 2020. pp. 12410\u201312417 (2020)","DOI":"10.1609\/aaai.v34i07.6927"},{"key":"3_CR32","doi-asserted-by":"crossref","unstructured":"Xia, Y., Zhang, D., Kim, J., Nakayama, K., Zipser, K., Whitney, D.: Predicting driver attention in critical situations. In: ACCV 2018. pp. 658\u2013674 (2019)","DOI":"10.1007\/978-3-030-20873-8_42"},{"key":"3_CR33","unstructured":"Xue, H., Sun, Y., Liu, B., Fu, J., Song, R., Li, H., Luo, J.: Clip-vip: Adapting pre-trained image-text model to video-language representation alignment. arXiv preprint arXiv:2209.06430 (2022)"},{"key":"3_CR34","unstructured":"Yu, J., Wang, Z., Vasudevan, V., Yeung, L., Seyedhosseini, M., Wu, Y.: Coca: Contrastive captioners are image-text foundation models. arXiv preprint arXiv:2205.01917 (2022)"},{"key":"3_CR35","unstructured":"Yuan, L., Chen, D., Chen, Y.L., Codella, N., Dai, X., Gao, J., Hu, H., Huang, X., Li, B., Li, C., et\u00a0al.: Florence: A new foundation model for computer vision. arXiv preprint arXiv:2111.11432 (2021)"},{"issue":"12","key":"3_CR36","doi-asserted-by":"publisher","first-page":"7696","DOI":"10.1109\/TCSVT.2023.3278410","volume":"33","author":"X Zhou","year":"2023","unstructured":"Zhou, X., Wu, S., Shi, R., Zheng, B., Wang, S., Yin, H., Zhang, J., Yan, C.: Transformer-based multi-scale feature integration network for video saliency prediction. IEEE Trans. Circuits Syst. Video Technol. 33(12), 7696\u20137707 (2023)","journal-title":"IEEE Trans. Circuits Syst. 
Video Technol."}],"container-title":["Lecture Notes in Computer Science","Pattern Recognition"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-78312-8_3","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,12,3]],"date-time":"2024-12-03T13:03:31Z","timestamp":1733231011000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-78312-8_3"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,12,4]]},"ISBN":["9783031783111","9783031783128"],"references-count":36,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-78312-8_3","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2024,12,4]]},"assertion":[{"value":"4 December 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ICPR","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Conference on Pattern Recognition","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Kolkata","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"India","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"1 December 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"5 December 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"27","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"icpr2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/icpr2024.org\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}