{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,3,11]],"date-time":"2025-03-11T04:19:44Z","timestamp":1741666784463,"version":"3.38.0"},"reference-count":54,"publisher":"SAGE Publications","issue":"2","content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IA"],"published-print":{"date-parts":[[2019,1,29]]},"DOI":"10.3233\/ia-170033","type":"journal-article","created":{"date-parts":[[2019,1,29]],"date-time":"2019-01-29T16:53:24Z","timestamp":1548780804000},"page":"161-175","source":"Crossref","is-referenced-by-count":0,"title":["Attentive models in vision: Computing saliency maps in the deep learning era"],"prefix":"10.1177","volume":"12","author":[{"given":"Marcella","family":"Cornia","sequence":"first","affiliation":[{"name":"Department of Engineering \u201cEnzo Ferrari\u201d, University of Modena and Reggio Emilia"}]},{"given":"Davide","family":"Abati","sequence":"additional","affiliation":[{"name":"Department of Engineering \u201cEnzo Ferrari\u201d, University of Modena and Reggio Emilia"}]},{"given":"Lorenzo","family":"Baraldi","sequence":"additional","affiliation":[{"name":"Department of Engineering \u201cEnzo Ferrari\u201d, University of Modena and Reggio Emilia"}]},{"given":"Andrea","family":"Palazzi","sequence":"additional","affiliation":[{"name":"Department of Engineering \u201cEnzo Ferrari\u201d, University of Modena and Reggio Emilia"}]},{"given":"Simone","family":"Calderara","sequence":"additional","affiliation":[{"name":"Department of Engineering \u201cEnzo Ferrari\u201d, University of Modena and Reggio Emilia"}]},{"given":"Rita","family":"Cucchiara","sequence":"additional","affiliation":[{"name":"Department of Engineering \u201cEnzo Ferrari\u201d, University of Modena and Reggio Emilia"}]}],"member":"179","reference":[{"key":"10.3233\/IA-170033_ref1","doi-asserted-by":"crossref","unstructured":"Alletto S. , Palazzi A. , Solera F. , Calderara S. and Cucchiara R. , DR(eye)VE: A Dataset for Attention-Based Tasks with Applications to Autonomous and Assisted Driving, In IEEE International Conference on Computer Vision and Pattern Recognition Workshops, 2016.","DOI":"10.1109\/CVPRW.2016.14"},{"key":"10.3233\/IA-170033_ref2","doi-asserted-by":"crossref","unstructured":"Baraldi L. , Grana C. and Cucchiara R. , A deep siamese network for scene detection in broadcast videos, In ACM International Conference on Multimedia, 2015.","DOI":"10.1145\/2733373.2806316"},{"issue":"5","key":"10.3233\/IA-170033_ref3","doi-asserted-by":"crossref","first-page":"955","DOI":"10.1109\/TMM.2016.2644872","article-title":"Recognizing and presenting the storytelling video structure with deep multimodal networks","volume":"19","author":"Baraldi","year":"2017","journal-title":"IEEE Transactions on Multimedia"},{"key":"10.3233\/IA-170033_ref4","unstructured":"Bazzani L. , Larochelle H. and Torresani L. , Recurrent mixture density network for spatiotemporal visual attention, In International Conference on Learning Representations, 2017."},{"key":"10.3233\/IA-170033_ref5","first-page":"155","article-title":"Saliency based on information maximization","author":"Bruce","year":"2005","journal-title":"Advances in Neural Information Processing Systems"},{"key":"10.3233\/IA-170033_ref6","unstructured":"Bylinskii Z. , Judd T. , Borji A. , Itti L. , Durand F. , Oliva A. and Torralba A. , Mit saliency benchmark, http:\/\/saliency.mit.edu\/"},{"key":"10.3233\/IA-170033_ref7","unstructured":"Bylinskii Z. , Judd T. , Oliva A. , Torralba A. and Durand F. , What do different evaluation metrics tell us about saliency models? arXiv preprint arXiv:1604.03605, 2016."},{"key":"10.3233\/IA-170033_ref8","doi-asserted-by":"crossref","unstructured":"Chen L. , Zhang H. , Xiao J. , Nie L. , Shao J. , Liu W. and Chua T.-S. , SCA-CNN: Spatial and Channel-Wise Attention in Convolutional Networks for Image Captioning, In IEEE International Conference on Computer Vision and Pattern Recognition, 2017.","DOI":"10.1109\/CVPR.2017.667"},{"key":"10.3233\/IA-170033_ref9","doi-asserted-by":"crossref","unstructured":"Cornia M. , Abati D. , Baraldi L. , Palazzi A. , Calderara S. and Cucchiara R. , Attentive Models in Vision: Computing Saliency Maps in the Deep Learning Era, In Conference of the Italian Association for Artificial Intelligence, Springer, 2017, pp. 387\u2013399.","DOI":"10.1007\/978-3-319-70169-1_29"},{"key":"10.3233\/IA-170033_ref10","doi-asserted-by":"crossref","unstructured":"Cornia M. , Baraldi L. , Serra G. and Cucchiara R. , A Deep Multi-Level Network for Saliency Prediction, In International Conference on Pattern Recognition, 2016.","DOI":"10.1109\/ICPR.2016.7900174"},{"key":"10.3233\/IA-170033_ref11","doi-asserted-by":"crossref","unstructured":"Cornia M. , Baraldi L. , Serra G. and Cucchiara R. , Visual Saliency for Image Captioning in New Multimedia Services, In IEEE International Conference on Multimedia and Expo Workshops, 2017.","DOI":"10.1109\/ICMEW.2017.8026277"},{"issue":"2","key":"10.3233\/IA-170033_ref12","doi-asserted-by":"crossref","first-page":"48","DOI":"10.1145\/3177745","article-title":"Paying more attention to saliency: Image captioning with saliency and context attention","volume":"14","author":"Cornia","year":"2018","journal-title":"ACM Transactions on Multimedia Computing, Communications and Applications"},{"key":"10.3233\/IA-170033_ref13","doi-asserted-by":"crossref","unstructured":"Cornia M. , Baraldi L. , Serra G. and Cucchiara R. , Predicting human eye fixations via an LSTMbased saliency attentive model, IEEE Transactions on Image Processing (2018).","DOI":"10.1109\/TIP.2018.2851672"},{"key":"10.3233\/IA-170033_ref14","doi-asserted-by":"crossref","unstructured":"Cornia M. , Baraldi L. , Serra G. and Cucchiara R. , SAM: Pushing the Limits of Saliency Prediction Models, In IEEE International Conference on Computer Vision and Pattern Recognition Workshops, 2018.","DOI":"10.1109\/CVPRW.2018.00250"},{"key":"10.3233\/IA-170033_ref15","doi-asserted-by":"crossref","unstructured":"Cornia M. , Pini S. , Baraldi L. and Cucchiara R. , Automatic image cropping and selection using saliency: An application to historical manuscripts, In Digital Libraries and Multimedia Archives 806 (2018).","DOI":"10.1007\/978-3-319-73165-0_17"},{"key":"10.3233\/IA-170033_ref16","doi-asserted-by":"crossref","unstructured":"Greenspan H. , Belongie S. , Goodman R. , Perona P. , Rakshit S. and Anderson C.H. , Over complete steerable pyramid filters and rotation invariance, In IEEE International Conference on Computer Vision and Pattern Recognition, 1994.","DOI":"10.1109\/CVPR.1994.323833"},{"issue":"1","key":"10.3233\/IA-170033_ref17","doi-asserted-by":"crossref","first-page":"19","DOI":"10.1109\/TIP.2013.2282897","article-title":"Saliency-aware video compression","volume":"23","author":"Hadizadeh","year":"2014","journal-title":"IEEE Transactions on Image Processing"},{"key":"10.3233\/IA-170033_ref18","first-page":"545","article-title":"Graphbased visual saliency","author":"Harel","year":"2006","journal-title":"Advances in Neural Information Processing Systems"},{"key":"10.3233\/IA-170033_ref19","doi-asserted-by":"crossref","unstructured":"He K. , Zhang X. , Ren S. and Sun J. , Deep residual learning for image recognition, In IEEE International Conference on Computer Vision and Pattern Recognition, 2016.","DOI":"10.1109\/CVPR.2016.90"},{"issue":"8","key":"10.3233\/IA-170033_ref20","doi-asserted-by":"crossref","first-page":"1735","DOI":"10.1162\/neco.1997.9.8.1735","article-title":"Long short-term memory","volume":"9","author":"Hochreiter","year":"1997","journal-title":"Neural Computation"},{"key":"10.3233\/IA-170033_ref21","doi-asserted-by":"crossref","unstructured":"Huang X. , Shen C. , Boix X. and Zhao Q. , SALICON: Reducing the Semantic Gap in Saliency Prediction by Adapting Deep Neural Networks, In IEEE International Conference on Computer Vision, 2015.","DOI":"10.1109\/ICCV.2015.38"},{"issue":"3","key":"10.3233\/IA-170033_ref22","doi-asserted-by":"crossref","first-page":"194","DOI":"10.1038\/35058500","article-title":"Computational modelling of visual attention","volume":"2","author":"Itti","year":"2001","journal-title":"Nature Reviews Neuroscience"},{"issue":"11","key":"10.3233\/IA-170033_ref23","doi-asserted-by":"crossref","first-page":"1254","DOI":"10.1109\/34.730558","article-title":"A model of saliency-based visual attention for rapid scene analysis","volume":"20","author":"Itti","year":"1998","journal-title":"IEEE Transactions on Pattern Analysis and Machine Intelligence"},{"key":"10.3233\/IA-170033_ref24","doi-asserted-by":"crossref","unstructured":"Jetley S. , Murray N. and Vig E. , End-to-end saliency mapping via probability distribution prediction, In IEEE International Conference on Computer Vision and Pattern Recognition, 2016.","DOI":"10.1109\/CVPR.2016.620"},{"issue":"1","key":"10.3233\/IA-170033_ref25","doi-asserted-by":"crossref","first-page":"221","DOI":"10.1109\/TPAMI.2012.59","article-title":"3d convolutional neural networks for human action recognition","volume":"35","author":"Ji","year":"2013","journal-title":"IEEE transactions on pattern analysis and machine intelligence"},{"key":"10.3233\/IA-170033_ref26","doi-asserted-by":"crossref","unstructured":"Jiang M. , Huang S. , Duan J. and Zhao Q. , Salicon: Saliency in context, In IEEE International Conference on Computer Vision and Pattern Recognition, 2015.","DOI":"10.1109\/CVPR.2015.7298710"},{"key":"10.3233\/IA-170033_ref27","unstructured":"Judd T. , Durand F. and Torralba A. , A benchmark of computational models of saliency to predict human fixations, In MIT Technical Report, 2012."},{"key":"10.3233\/IA-170033_ref28","doi-asserted-by":"crossref","unstructured":"Judd T. , Ehinger K. , Durand F. and Torralba A. , Learning to predict where humans look, In IEEE International Conference on Computer Vision, 2009.","DOI":"10.1109\/ICCV.2009.5459462"},{"key":"10.3233\/IA-170033_ref29","unstructured":"Kingma D. and Ba J. , Adam: A method for stochastic optimization, In International Conference on Learning Representations, 2015."},{"key":"10.3233\/IA-170033_ref30","doi-asserted-by":"crossref","first-page":"115","DOI":"10.1007\/978-94-009-3833-5_5","volume-title":"Matters of intelligence","author":"Koch","year":"1987"},{"key":"10.3233\/IA-170033_ref31","doi-asserted-by":"crossref","first-page":"121","DOI":"10.1016\/j.jneumeth.2014.01.032","article-title":"A nonparametric method for detecting fixations and saccades using cluster analysis: Removing the need for arbitrary thresholds","volume":"227","author":"K\u00f6nig","year":"2014","journal-title":"Journal of neuroscience methods"},{"issue":"9","key":"10.3233\/IA-170033_ref32","doi-asserted-by":"crossref","first-page":"4446","DOI":"10.1109\/TIP.2017.2710620","article-title":"Deepfix: A fully convolutional neural network for predicting human eye fixations","volume":"26","author":"Kruthiventi","year":"2017","journal-title":"IEEE Transactions on Image Processing"},{"key":"10.3233\/IA-170033_ref33","doi-asserted-by":"crossref","unstructured":"Kruthiventi S.S.S. , Gudisa V. , Dholakiya J.H. and Babu R.V. , Saliency unified: A deep architecture for simultaneous eye fixation prediction and salient object segmentation, In IEEE International Conference on Computer Vision and Pattern Recognition, 2016.","DOI":"10.1109\/CVPR.2016.623"},{"key":"10.3233\/IA-170033_ref34","unstructured":"K\u00fcmmerer M. , Theis L. and Bethge M. , DeepGaze I: Boosting saliency prediction with feature maps trained on ImageNet, In ICLR Workshops, 2015."},{"key":"10.3233\/IA-170033_ref35","doi-asserted-by":"crossref","unstructured":"Lin T.-Y. , Maire M. , Belongie S. , Hays J. , Perona P. , Ramanan D. , Doll\u00e1r P. and Zitnick C.L. , Microsoft coco: Common objects in context, In European Conference on Computer Vision, 2014.","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"10.3233\/IA-170033_ref36","doi-asserted-by":"crossref","unstructured":"Liu S. , Zhu Z. , Ye N. , Guadarrama S. and Murphy K. , Improved Image Captioning via Policy Gradient Optimization of SPIDEr, In IEEE International Conference on Computer Vision, 2017.","DOI":"10.1109\/ICCV.2017.100"},{"key":"10.3233\/IA-170033_ref37","doi-asserted-by":"crossref","unstructured":"Mathe S. and Sminchisescu C. , Actions in the eye: Dynamic gaze datasets and learnt saliency models for visual recognition, IEEE Transactions on Pattern Analysis and Machine Intelligence 37, 2015.","DOI":"10.1109\/TPAMI.2014.2366154"},{"key":"10.3233\/IA-170033_ref38","doi-asserted-by":"crossref","unstructured":"Palazzi A. , Abati D. , Calderara S. , Solera F. and Cucchiara R. , Predicting the Driver\u2019s Focus of Attention: The DR (eye) VE Project, IEEE Transactions on Pattern Analysis and Machine Intelligence, 2018.","DOI":"10.1109\/TPAMI.2018.2845370"},{"key":"10.3233\/IA-170033_ref39","doi-asserted-by":"crossref","unstructured":"Palazzi A. , Solera F. , Calderara S. , Alletto S. and Cucchiara R. , Learning to attend like a human driver, In Intelligent Vehicles Symposium, 2017.","DOI":"10.1109\/IVS.2017.7995833"},{"key":"10.3233\/IA-170033_ref40","doi-asserted-by":"crossref","unstructured":"Pan J. , McGuinness K. , Sayrol E. , O\u2019Connor N. and Gir&rsquo;po-i X. , Nieto, Shallow and Deep Convolutional Networks for Saliency Prediction, In IEEE International Conference on Computer Vision and Pattern Recognition, 2016.","DOI":"10.1109\/CVPR.2016.71"},{"issue":"18","key":"10.3233\/IA-170033_ref41","doi-asserted-by":"crossref","first-page":"2397","DOI":"10.1016\/j.visres.2005.03.019","article-title":"Components of bottom-up gaze allocation in natural images","volume":"45","author":"Peters","year":"2005","journal-title":"Vision research"},{"key":"10.3233\/IA-170033_ref42","doi-asserted-by":"crossref","unstructured":"Rudoy D. , Goldman D.B. , Shechtman E. and Zelnik-Manor L. , Learning video saliency from human gaze using candidate selection, In IEEE International Conference on Computer Vision and Pattern Recognition, 2013.","DOI":"10.1109\/CVPR.2013.152"},{"key":"10.3233\/IA-170033_ref43","unstructured":"Simonyan K. and Zisserman A. , Very deep convolutional networks for large-scale image recognition, CoRR, abs\/1409.1556, 2014."},{"key":"10.3233\/IA-170033_ref44","doi-asserted-by":"crossref","unstructured":"Tran D. , Bourdev L. , Fergus R. , Torresani L. and Paluri M. , Learning spatiotemporal features with 3d convolutional networks, In IEEE International Conference on Computer Vision, 2015.","DOI":"10.1109\/ICCV.2015.510"},{"issue":"1","key":"10.3233\/IA-170033_ref45","doi-asserted-by":"crossref","first-page":"97","DOI":"10.1016\/0010-0285(80)90005-5","article-title":"A feature-integration theory of attention","volume":"12","author":"Treisman","year":"1980","journal-title":"Cognitive psychology"},{"key":"10.3233\/IA-170033_ref46","doi-asserted-by":"crossref","unstructured":"Vig E. , Dorr M. and Cox D. , Large-scale optimization of hierarchical features for saliency prediction in natural images, In IEEE International Conference on Computer Vision and Pattern Recognition, 2014.","DOI":"10.1109\/CVPR.2014.358"},{"key":"10.3233\/IA-170033_ref47","unstructured":"Wang W. , Shen J. and Porikli F. , Saliencyaware geodesic video object segmentation, In IEEE International Conference on Computer Vision and Pattern Recognition, 2015."},{"issue":"11","key":"10.3233\/IA-170033_ref48","doi-asserted-by":"crossref","first-page":"4185","DOI":"10.1109\/TIP.2015.2460013","article-title":"Consistent video saliency using local gradient flow optimization and global refinement","volume":"24","author":"Wang","year":"2015","journal-title":"IEEE Transactions on Image Processing"},{"key":"10.3233\/IA-170033_ref49","unstructured":"Xu K. , Ba J. , Kiros R. , Cho K. , Courville A. , Salakhutdinov R. , Zemel R.S. and Bengio Y. , Show, attend and tell: Neural image caption generation with visual attention, In International Conference on Machine Learning, 2015."},{"key":"10.3233\/IA-170033_ref50","doi-asserted-by":"crossref","unstructured":"You Q. , Jin H. , Wang Z. , Fang C. and Luo J. , Image captioning with semantic attention, In IEEE International Conference on Computer Vision and Pattern Recognition, 2016.","DOI":"10.1109\/CVPR.2016.503"},{"key":"10.3233\/IA-170033_ref51","doi-asserted-by":"crossref","unstructured":"Zeiler M.D. and Fergus R. , Visualizing and understanding convolutional networks, In European Conference on Computer Vision, 2014.","DOI":"10.1007\/978-3-319-10590-1_53"},{"key":"10.3233\/IA-170033_ref52","doi-asserted-by":"crossref","unstructured":"Zhai Y. and Shah M. , Visual attention detection in video sequences using spatiotemporal cues, In ACM International Conference on Multimedia, 2006.","DOI":"10.1145\/1180639.1180824"},{"key":"10.3233\/IA-170033_ref53","doi-asserted-by":"crossref","unstructured":"Zhang J. and Sclaroff S. , Saliency detection: A boolean map approach, In IEEE International Conference on Computer Vision, 2013.","DOI":"10.1109\/ICCV.2013.26"},{"key":"10.3233\/IA-170033_ref54","doi-asserted-by":"crossref","unstructured":"Zhong S.-H. , Liu Y. , Ren F. , Zhang J. and Ren T. , Video saliency detection via dynamic consistent spatio-temporal attention modelling, In AAAI, 2013.","DOI":"10.1609\/aaai.v27i1.8642"}],"container-title":["Intelligenza Artificiale"],"original-title":[],"link":[{"URL":"https:\/\/content.iospress.com\/download?id=10.3233\/IA-170033","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,3,10]],"date-time":"2025-03-10T14:46:33Z","timestamp":1741617993000},"score":1,"resource":{"primary":{"URL":"https:\/\/www.medra.org\/servlet\/aliasResolver?alias=iospress&doi=10.3233\/IA-170033"}},"subtitle":[],"editor":[{"given":"Stefano","family":"Ferilli","sequence":"additional","affiliation":[]},{"given":"Francesca Alessandra","family":"Lisi","sequence":"additional","affiliation":[]}],"short-title":[],"issued":{"date-parts":[[2019,1,29]]},"references-count":54,"journal-issue":{"issue":"2"},"URL":"https:\/\/doi.org\/10.3233\/ia-170033","relation":{},"ISSN":["1724-8035","2211-0097"],"issn-type":[{"type":"print","value":"1724-8035"},{"type":"electronic","value":"2211-0097"}],"subject":[],"published":{"date-parts":[[2019,1,29]]}}}