{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,5,12]],"date-time":"2025-05-12T12:09:54Z","timestamp":1747051794534,"version":"3.40.4"},"reference-count":41,"publisher":"Springer Science and Business Media LLC","issue":"7","license":[{"start":{"date-parts":[[2024,11,16]],"date-time":"2024-11-16T00:00:00Z","timestamp":1731715200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,11,16]],"date-time":"2024-11-16T00:00:00Z","timestamp":1731715200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62101346","62101346","62101346","62101346"],"award-info":[{"award-number":["62101346","62101346","62101346","62101346"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Vis Comput"],"published-print":{"date-parts":[[2025,5]]},"DOI":"10.1007\/s00371-024-03695-7","type":"journal-article","created":{"date-parts":[[2024,11,16]],"date-time":"2024-11-16T07:57:50Z","timestamp":1731743870000},"page":"4865-4877","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":1,"title":["Cross-view Transformer for enhanced multi-view 3D reconstruction"],"prefix":"10.1007","volume":"41","author":[{"given":"Wuzhen","family":"Shi","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Aixue","family":"Yin","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yingxiang","family":"Li","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Bo","family":"Qian","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2024,11,16]]},"reference":[{"key":"3695_CR1","unstructured":"Chang, A.X., Funkhouser, T., Guibas, L., Hanrahan, P., Huang, Q., Li, Z., Savarese, S., Savva, M., Song, S., Su, H., et\u00a0al.: Shapenet: An information-rich 3D model repository. arXiv preprint arXiv:1512.03012 (2015)"},{"key":"3695_CR2","doi-asserted-by":"crossref","unstructured":"Chen, Z., Zhu, Y., Zhao, C., Hu, G., Zeng, W., Wang, J., Tang, M.: DPT: deformable patch-based transformer for visual recognition. In: Proceedings of the 29th ACM International Conference on Multimedia, pp. 2899\u20132907 (2021)","DOI":"10.1145\/3474085.3475467"},{"key":"3695_CR3","doi-asserted-by":"crossref","unstructured":"Choy, C.B., Xu, D., Gwak, J., Chen, K., Savarese, S.: 3D-R2N2: a unified approach for single and multi-view 3D object reconstruction. In: ECCV 2016 (2016)","DOI":"10.1007\/978-3-319-46484-8_38"},{"key":"3695_CR4","unstructured":"Dosovitskiy, A., Beyer, L.: An image is worth 16$$\\times $$16 words: transformers for image recognition at scale. In: ICLR 2021\u20149th International Conference on Learning Representations (2021)"},{"key":"3695_CR5","doi-asserted-by":"publisher","first-page":"135","DOI":"10.1016\/j.knosys.2016.02.001","volume":"99","author":"M Du","year":"2016","unstructured":"Du, M., Ding, S., Jia, H.: Study on density peaks clustering based on k-nearest neighbors and principal component analysis. Knowl. Based Syst. 99, 135\u2013145 (2016)","journal-title":"Knowl. Based Syst."},{"key":"3695_CR6","doi-asserted-by":"publisher","first-page":"55","DOI":"10.1007\/s10462-012-9365-8","volume":"43","author":"J Fuentes-Pacheco","year":"2015","unstructured":"Fuentes-Pacheco, J., Ruiz-Ascencio, J., Rend\u00f3n-Mancha, J.M.: Visual simultaneous localization and mapping: a survey. Artif. Intell. Rev. 43, 55\u201381 (2015)","journal-title":"Artif. Intell. Rev."},{"key":"3695_CR7","doi-asserted-by":"crossref","unstructured":"Gupta, A., Dar, G., Goodman, S., Ciprut, D., Berant, J.: Memory-efficient transformers via top-$$ k $$ attention. In: SustaiNLP 2021\u20142nd Workshop on Simple and Efficient Natural Language Processing, pp. 39\u201352 (2021)","DOI":"10.18653\/v1\/2021.sustainlp-1.5"},{"key":"3695_CR8","doi-asserted-by":"crossref","unstructured":"Han, D., Pan, X., Han, Y., Song, S., Huang, G.: Flatten transformer: vision transformer using focused linear attention. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 5961\u20135971 (2023)","DOI":"10.1109\/ICCV51070.2023.00548"},{"key":"3695_CR9","doi-asserted-by":"crossref","unstructured":"Han, D., Ye, T., Han, Y., Xia, Z., Song, S., Huang, G.: Agent attention: on the integration of softmax and linear attention. arXiv preprint arXiv:2312.08874 (2023)","DOI":"10.1007\/978-3-031-72973-7_8"},{"key":"3695_CR10","unstructured":"Kar, A., H\u00e4ne, C., Malik, J.: Learning a multi-view stereo machine. Adv. Neural Inf. Process. Syst. 30 (2017)"},{"issue":"1","key":"3695_CR11","doi-asserted-by":"publisher","first-page":"532","DOI":"10.1109\/TNNLS.2022.3175775","volume":"35","author":"A Karambakhsh","year":"2022","unstructured":"Karambakhsh, A., Sheng, B., Li, P., Li, H., Kim, J., Jung, Y., Chen, C.P.: SparseVoxNet: 3-D object recognition with sparsely aggregation of 3-D dense blocks. IEEE Trans. Neural Netw. Learn. Syst. 35(1), 532\u2013546 (2022)","journal-title":"IEEE Trans. Neural Netw. Learn. Syst."},{"key":"3695_CR12","unstructured":"Katharopoulos, A., Vyas, A., Pappas, N., Fleuret, F.: Transformers are RNNs: fast autoregressive transformers with linear attention. In: International Conference on Machine Learning, pp. 5156\u20135165. PMLR (2020)"},{"key":"3695_CR13","doi-asserted-by":"crossref","unstructured":"Lin, X., Sun, S., Huang, W., Sheng, B., Li, P., Feng, D.D.: EAPT: efficient attention pyramid transformer for image processing. IEEE Trans. Multimed. 25, 50\u201361 (2021)","DOI":"10.1109\/TMM.2021.3120873"},{"key":"3695_CR14","unstructured":"Liu, R., Deng, H., Huang, Y., Shi, X., Lu, L., Sun, W., Wang, X., Dai, J., Li, H.: Decoupled spatial-temporal transformer for video inpainting. arXiv preprint arXiv:2104.06637 (2021)"},{"key":"3695_CR15","doi-asserted-by":"crossref","unstructured":"Liu, Z., Lin, Y., Cao, Y., Hu, H., Wei, Y., Zhang, Z., Lin, S., Guo, B.: Swin transformer: hierarchical vision transformer using shifted windows. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 10012\u201310022 (2021)","DOI":"10.1109\/ICCV48922.2021.00986"},{"key":"3695_CR16","doi-asserted-by":"crossref","unstructured":"Lorensen, W.E., Cline, H.E.: Marching cubes: a high resolution 3D surface construction algorithm. In: Seminal Graphics: Pioneering Efforts that Shaped the Field, pp. 347\u2013353 (1998)","DOI":"10.1145\/280811.281026"},{"key":"3695_CR17","unstructured":"Loshchilov, I., Hutter, F.: Decoupled weight decay regularization. In: 7th International Conference on Learning Representations, ICLR 2019 (2019)"},{"key":"3695_CR18","doi-asserted-by":"crossref","unstructured":"Milletari, F., Navab, N., Ahmadi, S.A.: V-net: fully convolutional neural networks for volumetric medical image segmentation. In: 2016 Fourth International Conference on 3D Vision (3DV), pp. 565\u2013571. IEEE (2016)","DOI":"10.1109\/3DV.2016.79"},{"key":"3695_CR19","doi-asserted-by":"publisher","first-page":"305","DOI":"10.1017\/S096249291700006X","volume":"26","author":"O \u00d6zye\u015fil","year":"2017","unstructured":"\u00d6zye\u015fil, O., Voroninski, V., Basri, R., Singer, A.: A survey of structure from motion. Acta Numer. 26, 305\u2013364 (2017)","journal-title":"Acta Numer."},{"key":"3695_CR20","unstructured":"Park, N., Kim, S.: How do vision transformers work? In: ICLR 2022\u201410th International Conference on Learning Representations (2022)"},{"key":"3695_CR21","unstructured":"Paszke, A., Gross, S., Massa, F., Lerer, A., Bradbury, J., Chanan, G., Killeen, T., Lin, Z., Gimelshein, N., Antiga, L., et\u00a0al.: Pytorch: an imperative style, high-performance deep learning library. Adv. Neural Inf. Process. Syst. 32 (2019)"},{"issue":"8","key":"3695_CR22","doi-asserted-by":"publisher","first-page":"3597","DOI":"10.1007\/s00371-023-02922-x","volume":"39","author":"Y Qin","year":"2023","unstructured":"Qin, Y., Chi, X., Sheng, B., Lau, R.W.: Guiderender: large-scale scene navigation based on multi-modal view frustum movement prediction. Vis. Comput. 39(8), 3597\u20133607 (2023)","journal-title":"Vis. Comput."},{"key":"3695_CR23","unstructured":"Shi, Z., Meng, Z., Xing, Y., Ma, Y., Wattenhofer, R.: 3D-RETR: end-to-end single and multi-view 3d reconstruction with transformers. In: 32nd British Machine Vision Conference, BMVC 2021 (2021)"},{"key":"3695_CR24","doi-asserted-by":"crossref","unstructured":"Tatarchenko, M., Richter, S.R., Ranftl, R., Li, Z., Koltun, V., Brox, T.: What do single-view 3D reconstruction networks learn? In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 3405\u20133414 (2019)","DOI":"10.1109\/CVPR.2019.00352"},{"key":"3695_CR25","doi-asserted-by":"crossref","unstructured":"Tiong, L.C.O., Sigmund, D., Teoh, A.B.J.: 3D-C2FT: coarse-to-fine transformer for multi-view 3D reconstruction. In: Proceedings of the Asian Conference on Computer Vision, pp. 1438\u20131454 (2022)","DOI":"10.1007\/978-3-031-26319-4_13"},{"key":"3695_CR26","unstructured":"Touvron, H., Cord, M., Douze, M., Massa, F., Sablayrolles, A., J\u00e9gou, H.: Training data-efficient image transformers & distillation through attention. In: International Conference on Machine Learning, pp. 10347\u201310357. PMLR (2021)"},{"key":"3695_CR27","unstructured":"Vaswani, A., Shazeer, N., Parmar, N., Uszkoreit, J., Jones, L., Gomez, A.N., Kaiser, \u0141., Polosukhin, I.: Attention is all you need. Adv. Neural Inf. Process. Syst. 30 (2017)"},{"key":"3695_CR28","doi-asserted-by":"crossref","unstructured":"Wang, D., Cui, X., Chen, X., Zou, Z., Shi, T., Salcudean, S., Wang, Z.J., Ward, R.: Multi-view 3D reconstruction with transformers. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 5722\u20135731 (2021)","DOI":"10.1109\/ICCV48922.2021.00567"},{"key":"3695_CR29","doi-asserted-by":"publisher","first-page":"6142","DOI":"10.1109\/TIP.2021.3092814","volume":"30","author":"Y Wen","year":"2021","unstructured":"Wen, Y., Chen, J., Sheng, B., Chen, Z., Li, P., Tan, P., Lee, T.Y.: Structure-aware motion deblurring using multi-adversarial optimized cyclegan. IEEE Trans. Image Process. 30, 6142\u20136155 (2021)","journal-title":"IEEE Trans. Image Process."},{"issue":"2","key":"3695_CR30","doi-asserted-by":"publisher","first-page":"994","DOI":"10.1109\/TIP.2018.2874285","volume":"28","author":"Y Wen","year":"2019","unstructured":"Wen, Y., Sheng, B., Li, P., Lin, W., Feng, D.D.: Deep color guided coarse-to-fine convolutional network cascade for depth image super-resolution. IEEE Trans. Image Process. 28(2), 994\u20131006 (2019)","journal-title":"IEEE Trans. Image Process."},{"key":"3695_CR31","doi-asserted-by":"crossref","unstructured":"Xia, Z., Pan, X., Song, S., Li, L.E., Huang, G.: Vision transformer with deformable attention. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 4794\u20134803 (2022)","DOI":"10.1109\/CVPR52688.2022.00475"},{"key":"3695_CR32","doi-asserted-by":"crossref","unstructured":"Xie, H., Yao, H., Sun, X., Zhou, S., Zhang, S.: Pix2Vox: context-aware 3D reconstruction from single and multi-view images. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 2690\u20132698 (2019)","DOI":"10.1109\/ICCV.2019.00278"},{"issue":"12","key":"3695_CR33","doi-asserted-by":"publisher","first-page":"2919","DOI":"10.1007\/s11263-020-01347-6","volume":"128","author":"H Xie","year":"2020","unstructured":"Xie, H., Yao, H., Zhang, S., Zhou, S., Sun, W.: Pix2Vox++: multi-scale context-aware 3D object reconstruction from single and multiple images. Int. J. Comput. Vis. 128(12), 2919\u20132935 (2020)","journal-title":"Int. J. Comput. Vis."},{"key":"3695_CR34","unstructured":"Yagubbayli, F., Wang, Y., Tonioni, A., Tombari, F.: Legoformer: Transformers for block-by-block multi-view 3d reconstruction. arXiv preprint arXiv:2106.12102 (2021)"},{"issue":"1","key":"3695_CR35","doi-asserted-by":"publisher","first-page":"53","DOI":"10.1007\/s11263-019-01217-w","volume":"128","author":"B Yang","year":"2020","unstructured":"Yang, B., Wang, S., Markham, A., Trigoni, N.: Robust attentional aggregation of deep feature sets for multi-view 3d reconstruction. Int. J. Comput. Vis. 128(1), 53\u201373 (2020)","journal-title":"Int. J. Comput. Vis."},{"key":"3695_CR36","doi-asserted-by":"crossref","unstructured":"Yang, L., Zhu, Z., Lin, X., Nong, J., Liang, Y.: Long-range grouping transformer for multi-view 3D reconstruction. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 18257\u201318267 (2023)","DOI":"10.1109\/ICCV51070.2023.01674"},{"key":"3695_CR37","doi-asserted-by":"crossref","unstructured":"Zeng, W., Jin, S., Liu, W., Qian, C., Luo, P., Ouyang, W., Wang, X.: Not all tokens are equal: human-centric visual analysis via token clustering transformer. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 11101\u201311111 (2022)","DOI":"10.1109\/CVPR52688.2022.01082"},{"key":"3695_CR38","doi-asserted-by":"crossref","unstructured":"Zeng, Y., Fu, J., Chao, H.: Learning joint spatial-temporal transformations for video inpainting. In: Computer Vision\u2014ECCV 2020: 16th European Conference, Glasgow, UK, August 23\u201328, 2020, Proceedings, Part XVI 16, pp. 528\u2013543. Springer (2020)","DOI":"10.1007\/978-3-030-58517-4_31"},{"key":"3695_CR39","doi-asserted-by":"crossref","unstructured":"Zhu, L., Wang, X., Ke, Z., Zhang, W., Lau, R.W.: Biformer: vision transformer with bi-level routing attention. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 10323\u201310333 (2023)","DOI":"10.1109\/CVPR52729.2023.00995"},{"key":"3695_CR40","doi-asserted-by":"crossref","unstructured":"Zhu, Z., Yang, L., Li, N., Jiang, C., Liang, Y.: Umiformer: mining the correlations between similar tokens for multi-view 3D reconstruction. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 18226\u201318235 (2023)","DOI":"10.1109\/ICCV51070.2023.01671"},{"key":"3695_CR41","doi-asserted-by":"publisher","first-page":"109674","DOI":"10.1016\/j.patcog.2023.109674","volume":"142","author":"Z Zhu","year":"2023","unstructured":"Zhu, Z., Yang, L., Lin, X., Yang, L., Liang, Y.: Garnet: global-aware multi-view 3D reconstruction network and the cost-performance tradeoff. Pattern Recognit 142, 109674 (2023)","journal-title":"Pattern Recognit"}],"container-title":["The Visual Computer"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s00371-024-03695-7.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s00371-024-03695-7\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s00371-024-03695-7.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,4,24]],"date-time":"2025-04-24T10:03:44Z","timestamp":1745489024000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s00371-024-03695-7"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,11,16]]},"references-count":41,"journal-issue":{"issue":"7","published-print":{"date-parts":[[2025,5]]}},"alternative-id":["3695"],"URL":"https:\/\/doi.org\/10.1007\/s00371-024-03695-7","relation":{},"ISSN":["0178-2789","1432-2315"],"issn-type":[{"type":"print","value":"0178-2789"},{"type":"electronic","value":"1432-2315"}],"subject":[],"published":{"date-parts":[[2024,11,16]]},"assertion":[{"value":"18 October 2024","order":1,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"16 November 2024","order":2,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors declare that they have no conflict of interest.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}}]}}