{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2024,12,12]],"date-time":"2024-12-12T05:11:29Z","timestamp":1733980289772,"version":"3.30.2"},"reference-count":46,"publisher":"Elsevier BV","license":[{"start":{"date-parts":[[2025,1,1]],"date-time":"2025-01-01T00:00:00Z","timestamp":1735689600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/tdm\/userlicense\/1.0\/"},{"start":{"date-parts":[[2025,1,1]],"date-time":"2025-01-01T00:00:00Z","timestamp":1735689600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/legal\/tdmrep-license"},{"start":{"date-parts":[[2025,1,1]],"date-time":"2025-01-01T00:00:00Z","timestamp":1735689600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-017"},{"start":{"date-parts":[[2025,1,1]],"date-time":"2025-01-01T00:00:00Z","timestamp":1735689600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"},{"start":{"date-parts":[[2025,1,1]],"date-time":"2025-01-01T00:00:00Z","timestamp":1735689600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-012"},{"start":{"date-parts":[[2025,1,1]],"date-time":"2025-01-01T00:00:00Z","timestamp":1735689600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,1,1]],"date-time":"2025-01-01T00:00:00Z","timestamp":1735689600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-004"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62272083","61876030","62102061"],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["elsevier.com","sciencedirect.com"],"crossmark-restriction":true},"short-container-title":["Computer Vision and Image Understanding"],"published-print":{"date-parts":[[2025,1]]},"DOI":"10.1016\/j.cviu.2024.104209","type":"journal-article","created":{"date-parts":[[2024,11,5]],"date-time":"2024-11-05T02:05:40Z","timestamp":1730772340000},"page":"104209","update-policy":"https:\/\/doi.org\/10.1016\/elsevier_cm_policy","source":"Crossref","is-referenced-by-count":0,"special_numbering":"C","title":["Bilevel progressive homography estimation via correlative region-focused transformer"],"prefix":"10.1016","volume":"250","author":[{"given":"Qi","family":"Jia","sequence":"first","affiliation":[]},{"given":"Xiaomei","family":"Feng","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8536-2948","authenticated-orcid":false,"given":"Wei","family":"Zhang","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2067-9175","authenticated-orcid":false,"given":"Yu","family":"Liu","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2179-8301","authenticated-orcid":false,"given":"Nan","family":"Pu","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6597-7248","authenticated-orcid":false,"given":"Nicu","family":"Sebe","sequence":"additional","affiliation":[]}],"member":"78","reference":[{"issue":"12","key":"10.1016\/j.cviu.2024.104209_b1","doi-asserted-by":"crossref","first-page":"2481","DOI":"10.1109\/TPAMI.2016.2644615","article-title":"Segnet: A deep convolutional encoder-decoder architecture for image segmentation","volume":"39","author":"Badrinarayanan","year":"2017","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"10.1016\/j.cviu.2024.104209_b2","doi-asserted-by":"crossref","unstructured":"Barath,\u00a0D., Matas,\u00a0J., Noskova,\u00a0J., 2019. MAGSAC: marginalizing sample consensus. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. pp. 10197\u201310205.","DOI":"10.1109\/CVPR.2019.01044"},{"key":"10.1016\/j.cviu.2024.104209_b3","doi-asserted-by":"crossref","unstructured":"Barath,\u00a0D., Noskova,\u00a0J., Ivashechkin,\u00a0M., Matas,\u00a0J., 2020. MAGSAC++, a fast, reliable and accurate robust estimator. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. pp. 1304\u20131312.","DOI":"10.1109\/CVPR42600.2020.00138"},{"key":"10.1016\/j.cviu.2024.104209_b4","series-title":"Computer Vision\u2013ECCV 2006: 9th European Conference on Computer Vision, Graz, Austria, May 7-13, 2006. Proceedings, Part I 9","first-page":"404","article-title":"Surf: Speeded up robust features","author":"Bay","year":"2006"},{"key":"10.1016\/j.cviu.2024.104209_b5","doi-asserted-by":"crossref","unstructured":"Cao,\u00a0S.Y., Hu,\u00a0J., Sheng,\u00a0Z., Shen,\u00a0H.L., 2022. Iterative deep homography estimation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. pp. 1879\u20131888.","DOI":"10.1109\/CVPR52688.2022.00192"},{"key":"10.1016\/j.cviu.2024.104209_b6","doi-asserted-by":"crossref","unstructured":"Cao,\u00a0Z., Simon,\u00a0T., Wei,\u00a0S.E., Sheikh,\u00a0Y., 2017. Realtime multi-person 2d pose estimation using part affinity fields. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition. pp. 7291\u20137299.","DOI":"10.1109\/CVPR.2017.143"},{"key":"10.1016\/j.cviu.2024.104209_b7","doi-asserted-by":"crossref","unstructured":"Cao,\u00a0S.Y., Zhang,\u00a0R., Luo,\u00a0L., Yu,\u00a0B., Sheng,\u00a0Z., Li,\u00a0J., Shen,\u00a0H.L., 2023. Recurrent Homography Estimation Using Homography-Guided Image Warping and Focus Transformer. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. pp. 9833\u20139842.","DOI":"10.1109\/CVPR52729.2023.00948"},{"year":"2016","series-title":"Deep image homography estimation","author":"DeTone","key":"10.1016\/j.cviu.2024.104209_b8"},{"key":"10.1016\/j.cviu.2024.104209_b9","doi-asserted-by":"crossref","unstructured":"DeTone,\u00a0D., Malisiewicz,\u00a0T., Rabinovich,\u00a0A., 2018. Superpoint: Self-supervised interest point detection and description. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition Workshops. pp. 224\u2013236.","DOI":"10.1109\/CVPRW.2018.00060"},{"year":"2020","series-title":"An image is worth 16x16 words: Transformers for image recognition at scale","author":"Dosovitskiy","key":"10.1016\/j.cviu.2024.104209_b10"},{"key":"10.1016\/j.cviu.2024.104209_b11","series-title":"European Conference on Computer Vision","first-page":"834","article-title":"LSD-SLAM: Large-scale direct monocular SLAM","author":"Engel","year":"2014"},{"key":"10.1016\/j.cviu.2024.104209_b12","doi-asserted-by":"crossref","unstructured":"Erlik\u00a0Nowruzi,\u00a0F., Laganiere,\u00a0R., Japkowicz,\u00a0N., 2017. Homography estimation from image pairs with hierarchical convolutional networks. In: Proceedings of the IEEE International Conference on Computer Vision Workshops. pp. 913\u2013920.","DOI":"10.1109\/ICCVW.2017.111"},{"issue":"6","key":"10.1016\/j.cviu.2024.104209_b13","doi-asserted-by":"crossref","first-page":"381","DOI":"10.1145\/358669.358692","article-title":"Random sample consensus: a paradigm for model fitting with applications to image analysis and automated cartography","volume":"24","author":"Fischler","year":"1981","journal-title":"Commun. ACM"},{"year":"2003","series-title":"Multiple View Geometry in Computer Vision","author":"Hartley","key":"10.1016\/j.cviu.2024.104209_b14"},{"key":"10.1016\/j.cviu.2024.104209_b15","series-title":"Intelligent Information and Database Systems: 12th Asian Conference, ACIIDS 2020, Phuket, Thailand, March 23\u201326, 2020, Proceedings, Part II 12","first-page":"141","article-title":"Deep feature extraction for panoramic image stitching","author":"Hoang","year":"2020"},{"issue":"9","key":"10.1016\/j.cviu.2024.104209_b16","doi-asserted-by":"crossref","first-page":"813","DOI":"10.1080\/03610927708827533","article-title":"Robust regression using iteratively reweighted least-squares","volume":"6","author":"Holland","year":"1977","journal-title":"Commun. Stat. Theory Methods"},{"issue":"1","key":"10.1016\/j.cviu.2024.104209_b17","first-page":"1","article-title":"Learning pixel-wise alignment for unsupervised image stitching","volume":"1","author":"Jia","year":"2023","journal-title":"Network"},{"key":"10.1016\/j.cviu.2024.104209_b18","doi-asserted-by":"crossref","unstructured":"Jia,\u00a0Q., Li,\u00a0Z., Fan,\u00a0X., Zhao,\u00a0H., Teng,\u00a0S., Ye,\u00a0X., Latecki,\u00a0L.J., 2021. Leveraging line-point consistence to preserve structures for wide parallax image stitching. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. pp. 12186\u201312195.","DOI":"10.1109\/CVPR46437.2021.01201"},{"key":"10.1016\/j.cviu.2024.104209_b19","doi-asserted-by":"crossref","unstructured":"Le,\u00a0H., Liu,\u00a0F., Zhang,\u00a0S., Agarwala,\u00a0A., 2020. Deep homography estimation for dynamic scenes. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. pp. 7652\u20137661.","DOI":"10.1109\/CVPR42600.2020.00767"},{"key":"10.1016\/j.cviu.2024.104209_b20","doi-asserted-by":"crossref","first-page":"5545","DOI":"10.1109\/TIP.2021.3086079","article-title":"Image stitching based on semantic planar region consensus","volume":"30","author":"Li","year":"2021","journal-title":"IEEE Trans. Image Process."},{"key":"10.1016\/j.cviu.2024.104209_b21","doi-asserted-by":"crossref","first-page":"724","DOI":"10.1109\/TIP.2019.2934344","article-title":"Single-perspective warps in natural image stitching","volume":"29","author":"Liao","year":"2019","journal-title":"IEEE Trans. Image Process."},{"key":"10.1016\/j.cviu.2024.104209_b22","series-title":"Computer Vision\u2013ECCV 2014: 13th European Conference, Zurich, Switzerland, September 6-12, 2014, Proceedings, Part V 13","first-page":"740","article-title":"Microsoft coco: Common objects in context","author":"Lin","year":"2014"},{"key":"10.1016\/j.cviu.2024.104209_b23","doi-asserted-by":"crossref","unstructured":"Lindenberger,\u00a0P., Sarlin,\u00a0P.E., Pollefeys,\u00a0M., 2023. Lightglue: Local feature matching at light speed. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision. pp. 17627\u201317638.","DOI":"10.1109\/ICCV51070.2023.01616"},{"key":"10.1016\/j.cviu.2024.104209_b24","doi-asserted-by":"crossref","unstructured":"Liu,\u00a0Z., Lin,\u00a0Y., Cao,\u00a0Y., Hu,\u00a0H., Wei,\u00a0Y., Zhang,\u00a0Z., Lin,\u00a0S., Guo,\u00a0B., 2021. Swin transformer: Hierarchical vision transformer using shifted windows. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision. pp. 10012\u201310022.","DOI":"10.1109\/ICCV48922.2021.00986"},{"key":"10.1016\/j.cviu.2024.104209_b25","series-title":"Computer Vision\u2013ECCV 2016: 14th European Conference, Amsterdam, the Netherlands, October 11\u201314, 2016, Proceedings, Part VI 14","first-page":"800","article-title":"Meshflow: Minimum latency online video stabilization","author":"Liu","year":"2016"},{"year":"2017","series-title":"Decoupled weight decay regularization","author":"Loshchilov","key":"10.1016\/j.cviu.2024.104209_b26"},{"key":"10.1016\/j.cviu.2024.104209_b27","doi-asserted-by":"crossref","first-page":"91","DOI":"10.1023\/B:VISI.0000029664.99615.94","article-title":"Distinctive image features from scale-invariant keypoints","volume":"60","author":"Lowe","year":"2004","journal-title":"Int. J. Comput. Vis."},{"key":"10.1016\/j.cviu.2024.104209_b28","doi-asserted-by":"crossref","first-page":"512","DOI":"10.1007\/s11263-018-1117-z","article-title":"Locality preserving matching","volume":"127","author":"Ma","year":"2019","journal-title":"Int. J. Comput. Vis."},{"year":"2013","series-title":"Efficient estimation of word representations in vector space","author":"Mikolov","key":"10.1016\/j.cviu.2024.104209_b29"},{"issue":"5","key":"10.1016\/j.cviu.2024.104209_b30","doi-asserted-by":"crossref","first-page":"1147","DOI":"10.1109\/TRO.2015.2463671","article-title":"ORB-SLAM: a versatile and accurate monocular SLAM system","volume":"31","author":"Mur-Artal","year":"2015","journal-title":"IEEE Trans. Robot."},{"issue":"3","key":"10.1016\/j.cviu.2024.104209_b31","doi-asserted-by":"crossref","first-page":"2346","DOI":"10.1109\/LRA.2018.2809549","article-title":"Unsupervised deep homography: A fast and robust homography estimation model","volume":"3","author":"Nguyen","year":"2018","journal-title":"IEEE Robot. Autom. Lett."},{"key":"10.1016\/j.cviu.2024.104209_b32","doi-asserted-by":"crossref","DOI":"10.1016\/j.jvcir.2020.102950","article-title":"A view-free image stitching network based on global homography","volume":"73","author":"Nie","year":"2020","journal-title":"J. Vis. Commun. Image Represent."},{"key":"10.1016\/j.cviu.2024.104209_b33","series-title":"2011 International Conference on Computer Vision","first-page":"2564","article-title":"ORB: An efficient alternative to SIFT or SURF","author":"Rublee","year":"2011"},{"key":"10.1016\/j.cviu.2024.104209_b34","doi-asserted-by":"crossref","unstructured":"Sarlin,\u00a0P.E., DeTone,\u00a0D., Malisiewicz,\u00a0T., Rabinovich,\u00a0A., 2020. Superglue: Learning feature matching with graph neural networks. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. pp. 4938\u20134947.","DOI":"10.1109\/CVPR42600.2020.00499"},{"key":"10.1016\/j.cviu.2024.104209_b35","doi-asserted-by":"crossref","unstructured":"Shao,\u00a0R., Wu,\u00a0G., Zhou,\u00a0Y., Fu,\u00a0Y., Fang,\u00a0L., Liu,\u00a0Y., 2021. Localtrans: A multiscale local transformer network for cross-resolution homography estimation. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision. pp. 14890\u201314899.","DOI":"10.1109\/ICCV48922.2021.01462"},{"key":"10.1016\/j.cviu.2024.104209_b36","doi-asserted-by":"crossref","first-page":"435","DOI":"10.1007\/s11265-019-01477-2","article-title":"An image mosaic method based on convolutional neural network semantic features extraction","volume":"92","author":"Shi","year":"2020","journal-title":"J. Signal Process. Syst."},{"key":"10.1016\/j.cviu.2024.104209_b37","doi-asserted-by":"crossref","unstructured":"Sun,\u00a0J., Shen,\u00a0Z., Wang,\u00a0Y., Bao,\u00a0H., Zhou,\u00a0X., 2021. LoFTR: Detector-free local feature matching with transformers. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. pp. 8922\u20138931.","DOI":"10.1109\/CVPR46437.2021.00881"},{"year":"2022","series-title":"Quadtree attention for vision transformers","author":"Tang","key":"10.1016\/j.cviu.2024.104209_b38"},{"key":"10.1016\/j.cviu.2024.104209_b39","doi-asserted-by":"crossref","unstructured":"Wang,\u00a0W., Xie,\u00a0E., Li,\u00a0X., Fan,\u00a0D.P., Song,\u00a0K., Liang,\u00a0D., Lu,\u00a0T., Luo,\u00a0P., Shao,\u00a0L., 2021. Pyramid vision transformer: A versatile backbone for dense prediction without convolutions. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision. pp. 568\u2013578.","DOI":"10.1109\/ICCV48922.2021.00061"},{"key":"10.1016\/j.cviu.2024.104209_b40","doi-asserted-by":"crossref","unstructured":"Zaragoza,\u00a0J., Chin,\u00a0T.J., Brown,\u00a0M.S., Suter,\u00a0D., 2013. As-projective-as-possible image stitching with moving DLT. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition. pp. 2339\u20132346.","DOI":"10.1109\/CVPR.2013.303"},{"issue":"11","key":"10.1016\/j.cviu.2024.104209_b41","doi-asserted-by":"crossref","first-page":"1330","DOI":"10.1109\/34.888718","article-title":"A flexible new technique for camera calibration","volume":"22","author":"Zhang","year":"2000","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"10.1016\/j.cviu.2024.104209_b42","series-title":"Computer Vision\u2013ECCV 2020: 16th European Conference, Glasgow, UK, August 23\u201328, 2020, Proceedings, Part I 16","first-page":"653","article-title":"Content-aware unsupervised deep homography estimation","author":"Zhang","year":"2020"},{"issue":"2","key":"10.1016\/j.cviu.2024.104209_b43","doi-asserted-by":"crossref","first-page":"225","DOI":"10.1109\/TCSVT.2015.2501941","article-title":"A global approach to fast video stabilization","volume":"27","author":"Zhang","year":"2015","journal-title":"IEEE Trans. Circuits Syst. Video Technol."},{"key":"10.1016\/j.cviu.2024.104209_b44","series-title":"2020 IEEE International Conference on Computational Photography","first-page":"1","article-title":"Multiscale-VR: multiscale gigapixel 3D panoramic videography for virtual reality","author":"Zhang","year":"2020"},{"key":"10.1016\/j.cviu.2024.104209_b45","doi-asserted-by":"crossref","unstructured":"Zhao,\u00a0Y., Huang,\u00a0X., Zhang,\u00a0Z., 2021. Deep lucas-kanade homography for multimodal image alignment. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. pp. 15950\u201315959.","DOI":"10.1109\/CVPR46437.2021.01569"},{"issue":"9","key":"10.1016\/j.cviu.2024.104209_b46","doi-asserted-by":"crossref","first-page":"3504","DOI":"10.1109\/TCSVT.2020.3040753","article-title":"Adaptively meshed video stabilization","volume":"31","author":"Zhao","year":"2020","journal-title":"IEEE Trans. Circuits Syst. Video Technol."}],"container-title":["Computer Vision and Image Understanding"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S107731422400290X?httpAccept=text\/xml","content-type":"text\/xml","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S107731422400290X?httpAccept=text\/plain","content-type":"text\/plain","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2024,12,11]],"date-time":"2024-12-11T03:05:49Z","timestamp":1733886349000},"score":1,"resource":{"primary":{"URL":"https:\/\/linkinghub.elsevier.com\/retrieve\/pii\/S107731422400290X"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,1]]},"references-count":46,"alternative-id":["S107731422400290X"],"URL":"https:\/\/doi.org\/10.1016\/j.cviu.2024.104209","relation":{},"ISSN":["1077-3142"],"issn-type":[{"type":"print","value":"1077-3142"}],"subject":[],"published":{"date-parts":[[2025,1]]},"assertion":[{"value":"Elsevier","name":"publisher","label":"This article is maintained by"},{"value":"Bilevel progressive homography estimation via correlative region-focused transformer","name":"articletitle","label":"Article Title"},{"value":"Computer Vision and Image Understanding","name":"journaltitle","label":"Journal Title"},{"value":"https:\/\/doi.org\/10.1016\/j.cviu.2024.104209","name":"articlelink","label":"CrossRef DOI link to publisher maintained version"},{"value":"article","name":"content_type","label":"Content Type"},{"value":"\u00a9 2024 Elsevier Inc. All rights are reserved, including those for text and data mining, AI training, and similar technologies.","name":"copyright","label":"Copyright"}],"article-number":"104209"}}