{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,9,4]],"date-time":"2025-09-04T13:49:17Z","timestamp":1756993757646,"version":"3.28.0"},"reference-count":69,"publisher":"IEEE","license":[{"start":{"date-parts":[[2022,8,21]],"date-time":"2022-08-21T00:00:00Z","timestamp":1661040000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2022,8,21]],"date-time":"2022-08-21T00:00:00Z","timestamp":1661040000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2022,8,21]]},"DOI":"10.1109\/icpr56361.2022.9956304","type":"proceedings-article","created":{"date-parts":[[2022,11,29]],"date-time":"2022-11-29T19:34:13Z","timestamp":1669750453000},"page":"3706-3713","source":"Crossref","is-referenced-by-count":1,"title":["Forecasting of depth and ego-motion with transformers and self-supervision"],"prefix":"10.1109","author":[{"given":"Houssem Eddine","family":"Boulahbal","sequence":"first","affiliation":[{"name":"C&#x00F4;te d&#x2019;Azur University,Renault Software Factory and CNRS-I3S"}]},{"given":"Adrian","family":"Voicila","sequence":"additional","affiliation":[{"name":"Renault Software Factory"}]},{"given":"Andrew I.","family":"Comport","sequence":"additional","affiliation":[{"name":"C&#x00F4;te d&#x2019;Azur University,CNRS-I3S"}]}],"member":"263","reference":[{"article-title":"Are we ready for a new paradigm shift? a survey on visual deep mlp","year":"2021","author":"liu","key":"ref39"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2015.2505283"},{"article-title":"Adam: A method for stochastic optimization","year":"2014","author":"kingma","key":"ref33"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.336"},{"key":"ref31","first-page":"2017","article-title":"Spatial transformer networks","volume":"2015 janua","author":"jaderberg","year":"2015","journal-title":"Advances in neural information processing systems"},{"key":"ref30","first-page":"2017","article-title":"Spatial transformer networks","volume":"28","author":"jaderberg","year":"2015","journal-title":"Advances in neural information processing systems"},{"article-title":"Improved multiscale vision transformers for classification and detection","year":"2021","author":"li","key":"ref37"},{"article-title":"Grounded language-image pre-training","year":"2021","author":"li","key":"ref36"},{"article-title":"Videoflow: A conditional flow-based model for stochastic video generation","year":"2019","author":"kumar","key":"ref35"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58565-5_35"},{"key":"ref60","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01249"},{"key":"ref62","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00122"},{"key":"ref61","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2003.819861"},{"article-title":"Inverting the pose forecasting pipeline with spf2: Sequential pointcloud forecasting for sequential pose forecasting","year":"2020","author":"weng","key":"ref63"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1145\/3467017"},{"key":"ref64","first-page":"64","article-title":"Deep Predictive Coding Networks for Video Prediction and Unsupervised Learning","author":"william lotter","year":"2016","journal-title":"Advances in neural information processing systems"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1162\/neco.1997.9.8.1735"},{"key":"ref65","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v32i1.12257"},{"key":"ref66","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00212"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58517-4_45"},{"article-title":"Scaling vision transformers","year":"2021","author":"zhai","key":"ref67"},{"key":"ref68","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00043"},{"key":"ref69","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.700"},{"key":"ref2","article-title":"Stochastic variational video prediction","author":"babaeizadeh","year":"2018","journal-title":"6th International Conference on Learning Representations ICLR 2018 -Conference Track Proceedings"},{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA.2019.8793512"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2012.6248074"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.699"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00393"},{"key":"ref24","first-page":"2279","article-title":"Panoptic segmentation forecasting","author":"graber","year":"2021","journal-title":"IEEE Computer Society Conference on Computer Vision and Pattern Recognition Workshops"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00907"},{"key":"ref26","first-page":"2360","article-title":"Pmhuber: Patchmatch with huber regularization for stereo matching","author":"heise","year":"2013","journal-title":"Proceedings of the IEEE International Conference on Computer Vision"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.5220\/0010241801010112"},{"key":"ref51","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.01252"},{"article-title":"Predicting video with vqvae","year":"2021","author":"walker","key":"ref59"},{"key":"ref58","article-title":"Attention is all you need","volume":"30","author":"vaswani","year":"2017","journal-title":"Advances in neural information processing systems"},{"key":"ref57","doi-asserted-by":"publisher","DOI":"10.1109\/WACV.2019.00186"},{"key":"ref56","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298594"},{"key":"ref55","doi-asserted-by":"publisher","DOI":"10.1109\/IROS.2012.6385773"},{"key":"ref54","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58529-7_34"},{"key":"ref53","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01066"},{"key":"ref52","first-page":"2482","article-title":"3D packing for self-supervised monocular depth estimation","author":"rares","year":"2020","journal-title":"Proceedings of the IEEE Computer Society Conference on Computer Vision and Pattern Recognition"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00716"},{"article-title":"Masked-attention mask transformer for universal image segmentation","year":"2021","author":"cheng","key":"ref11"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00986"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1109\/LRA.2020.2992184"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"ref14","first-page":"4171","article-title":"BERT: Pre-training of deep bidirectional transformers for language understanding","volume":"1(mlm)","author":"devlin","year":"2019","journal-title":"NAACL HLT 2019 - 2019 Conference of the North American Chapter of the Association for Computational Linguistics Human Language Technologies - Proceedings of the Conference"},{"article-title":"An image is worth 16x16 words: Transformers for image recognition at scale","year":"2020","author":"dosovitskiy","key":"ref15"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.304"},{"key":"ref17","first-page":"2366","article-title":"Depth map prediction from a single image using a multi-scale deep network","volume":"3","author":"eigen","year":"2014","journal-title":"Advances in neural information processing systems"},{"key":"ref18","first-page":"64","article-title":"Unsupervised learning for physical interaction through video prediction","author":"finn","year":"2016","journal-title":"Advances in neural information processing systems"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46484-8_45"},{"key":"ref4","first-page":"35","article-title":"Unsupervised scale-consistent depth and ego-motion learning from monocular video","volume":"32","author":"bian","year":"2019","journal-title":"Advances in neural information processing systems"},{"key":"ref3","article-title":"Bayesian prediction of future street scenes using synthetic likelihoods","author":"bhattacharyya","year":"2019","journal-title":"7th International Conference on Learning Representations ICLR 2019"},{"article-title":"Are conditional gans explicitly conditional?","year":"2021","author":"boulahbal","key":"ref6"},{"article-title":"On the opportunities and risks of foundation models","year":"2021","author":"bommasani","key":"ref5"},{"key":"ref8","first-page":"213","article-title":"End-to-end object detection with transformers","author":"carion","year":"2020","journal-title":"European Conference on Computer Vision"},{"key":"ref7","article-title":"Language models are few-shot learners","author":"brown","year":"2020","journal-title":"Advances in Neural Information Processing Systems 2020-December"},{"article-title":"Exploring the limits of transfer learning with a unified text-to-text transformer","year":"2019","author":"raffel","key":"ref49"},{"key":"ref9","first-page":"730","article-title":"Single-image depth perception in the wild","volume":"29","author":"chen","year":"2016","journal-title":"Advances in neural information processing systems"},{"key":"ref46","first-page":"8024","article-title":"Pytorch: An imperative style, high-performance deep learning library","author":"paszke","year":"2019","journal-title":"Advances in Neural IInformation Processing Systems"},{"key":"ref45","doi-asserted-by":"crossref","first-page":"1147","DOI":"10.1109\/TRO.2015.2463671","article-title":"Orb-slam: a versatile and accurate monocular slam system","volume":"31","author":"mur-artal","year":"2015","journal-title":"IEEE Transactions on Robotics"},{"article-title":"Learning transferable visual models from natural language supervision","year":"2021","author":"radford","key":"ref48"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00786"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1109\/IVS.2017.7995953"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.77"},{"article-title":"Deep multi-scale video prediction beyond mean square error","year":"2015","author":"mathieu","key":"ref44"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00594"}],"event":{"name":"2022 26th International Conference on Pattern Recognition (ICPR)","start":{"date-parts":[[2022,8,21]]},"location":"Montreal, QC, Canada","end":{"date-parts":[[2022,8,25]]}},"container-title":["2022 26th International Conference on Pattern Recognition (ICPR)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/9956007\/9955631\/09956304.pdf?arnumber=9956304","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2023,1,30]],"date-time":"2023-01-30T20:05:14Z","timestamp":1675109114000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/9956304\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2022,8,21]]},"references-count":69,"URL":"https:\/\/doi.org\/10.1109\/icpr56361.2022.9956304","relation":{},"subject":[],"published":{"date-parts":[[2022,8,21]]}}}