{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,14]],"date-time":"2026-03-14T18:01:52Z","timestamp":1773511312152,"version":"3.50.1"},"reference-count":56,"publisher":"Springer Science and Business Media LLC","issue":"1","license":[{"start":{"date-parts":[[2019,8,28]],"date-time":"2019-08-28T00:00:00Z","timestamp":1566950400000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0"},{"start":{"date-parts":[[2019,8,28]],"date-time":"2019-08-28T00:00:00Z","timestamp":1566950400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0"}],"funder":[{"DOI":"10.13039\/501100000769","name":"University of Oxford","doi-asserted-by":"crossref","id":[{"id":"10.13039\/501100000769","id-type":"DOI","asserted-by":"crossref"}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Int J Comput Vis"],"published-print":{"date-parts":[[2020,1]]},"DOI":"10.1007\/s11263-019-01217-w","type":"journal-article","created":{"date-parts":[[2019,8,28]],"date-time":"2019-08-28T06:02:36Z","timestamp":1566972156000},"page":"53-73","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":110,"title":["Robust Attentional Aggregation of Deep Feature Sets for Multi-view 3D Reconstruction"],"prefix":"10.1007","volume":"128","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-2419-4140","authenticated-orcid":false,"given":"Bo","family":"Yang","sequence":"first","affiliation":[]},{"given":"Sen","family":"Wang","sequence":"additional","affiliation":[]},{"given":"Andrew","family":"Markham","sequence":"additional","affiliation":[]},{"given":"Niki","family":"Trigoni","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2019,8,28]]},"reference":[{"key":"1217_CR1","unstructured":"Bahdanau, D., Cho, K., & Bengio, Y. (2015). Neural machine translation by jointly learning to align and translate. In International conference on learning representations."},{"issue":"2","key":"1217_CR2","doi-asserted-by":"publisher","first-page":"157","DOI":"10.1109\/72.279181","volume":"5","author":"Y Bengio","year":"1994","unstructured":"Bengio, Y., Simard, P., & Frasconi, P. (1994). Learning long-term dependencies with gradient descent is difficult. IEEE Transactions on Neural Networks, 5(2), 157\u2013166.","journal-title":"IEEE Transactions on Neural Networks"},{"issue":"6","key":"1217_CR3","doi-asserted-by":"publisher","first-page":"1309","DOI":"10.1109\/TRO.2016.2624754","volume":"32","author":"C Cadena","year":"2016","unstructured":"Cadena, C., Carlone, L., Carrillo, H., Latif, Y., Scaramuzza, D., Neira, J., et al. (2016). Past, present, and future of simultaneous localization and mapping: Towards the robust-perception age. IEEE Transactions on Robotics, 32(6), 1309\u20131332.","journal-title":"IEEE Transactions on Robotics"},{"key":"1217_CR4","doi-asserted-by":"crossref","unstructured":"Cao, Y. P., Liu, Z. N., Kuang, Z. F., Kobbelt, L., & Hu, S. M. (2018). Learning to reconstruct high-quality 3D shapes with cascaded fully convolutional networks. In European conference on computer vision (pp. 616\u2013633).","DOI":"10.1007\/978-3-030-01240-3_38"},{"key":"1217_CR5","unstructured":"Chang, A. X., Funkhouser, T., Guibas, L., Hanrahan, P., Huang, Q., Li, Z., Savarese, S., Savva, M., Song, S., Su, H., Xiao, J., Yi, L., & Yu, F. (2015). ShapeNet: An information-rich 3D model repository. \narXiv:1512.03012\n\n."},{"key":"1217_CR6","doi-asserted-by":"crossref","unstructured":"Choy, C. B., Xu, D., Gwak, J., Chen, K., & Savarese, S. (2016). 3D-R2N2: A unified approach for single and multi-view 3D object reconstruction. In European conference on computer vision.","DOI":"10.1007\/978-3-319-46484-8_38"},{"key":"1217_CR7","doi-asserted-by":"crossref","unstructured":"Curless, B., & Levoy, M. (1996). A volumetric method for building complex models from range images. In Conference on computer graphics and interactive techniques (pp. 303\u2013312).","DOI":"10.1145\/237170.237269"},{"key":"1217_CR8","doi-asserted-by":"crossref","unstructured":"Dong, W., Wang, Q., Wang, X., & Zha, H. (2018). PSDF fusion: Probabilistic signed distance function for on-the-fly 3D data fusion and scene reconstruction. In European conference on computer vision (pp. 714\u2013730).","DOI":"10.1007\/978-3-030-01240-3_43"},{"issue":"6394","key":"1217_CR9","doi-asserted-by":"publisher","first-page":"1204","DOI":"10.1126\/science.aar6170","volume":"360","author":"SA Eslami","year":"2018","unstructured":"Eslami, S. A., Rezende, D. J., Besse, F., Viola, F., Morcos, A. S., Garnelo, M., et al. (2018). Neural scene representation and rendering. Science, 360(6394), 1204\u20131210.","journal-title":"Science"},{"key":"1217_CR10","doi-asserted-by":"crossref","unstructured":"Fan, H., Su, H., & Guibas, L. (2017). A point set generation network for 3D object reconstruction from a single image. In IEEE conference on computer vision and pattern recognition (pp. 605\u2013613).","DOI":"10.1109\/CVPR.2017.264"},{"key":"1217_CR11","unstructured":"Gardner, A., Kanno, J., Duncan, C. A., & Selmic, R. R. (2017). Classifying unordered feature sets with convolutional deep averaging networks. \narXiv:1709.03019\n\n."},{"key":"1217_CR12","unstructured":"Girdhar, R., & Ramanan, D. (2017). Attentional pooling for action recognition. In International conference on neural information processing systems (pp. 33\u201344)."},{"key":"1217_CR13","doi-asserted-by":"publisher","DOI":"10.1017\/CBO9780511811685","volume-title":"Multiple view geometry in computer vision","author":"R Hartley","year":"2004","unstructured":"Hartley, R., & Zisserman, A. (2004). Multiple view geometry in computer vision. Cambridge: Cambridge University Press."},{"key":"1217_CR14","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., & Sun, J. (2016). Deep residual learning for image recognition. In IEEE conference on computer vision and pattern recognition (pp. 770\u2013778).","DOI":"10.1109\/CVPR.2016.90"},{"key":"1217_CR15","volume-title":"A field guide to dynamical recurrent networks","author":"S Hochreiter","year":"2001","unstructured":"Hochreiter, S., Bengio, Y., Frasconi, P., & Schmidhuber, J. (2001). Gradient flow in recurrent nets: The difficulty of learning long term dependencies. In J. F. Kolen & S. C. Kremer (Eds.), A field guide to dynamical recurrent networks. New York: Wiley."},{"key":"1217_CR16","doi-asserted-by":"crossref","unstructured":"Hu, J., Shen, L., Albanie, S., Sun, G., & Wu, E. (2018). Squeeze-and-excitation networks. IEEE conference on computer vision and pattern recognition (pp. 7132\u20137141).","DOI":"10.1109\/CVPR.2018.00745"},{"key":"1217_CR17","doi-asserted-by":"crossref","unstructured":"Huang, P. H., Matzen, K., Kopf, J., Ahuja, N., & Huang, J. B. (2018). DeepMVS: Learning multi-view stereopsis. In IEEE conference on computer vision and pattern recognition (pp. 2821\u20132830).","DOI":"10.1109\/CVPR.2018.00298"},{"key":"1217_CR18","unstructured":"Ilse, M., Tomczak, J. M., & Welling, M. (2018). Attention-based deep multiple instance learning. In International conference on machine learning (pp. 2127\u20132136)."},{"key":"1217_CR19","doi-asserted-by":"crossref","unstructured":"Ionescu, C., Vantzos, O., & Sminchisescu, C. (2015). Matrix backpropagation for deep networks with structured layers. In IEEE international conference on computer vision (pp. 2965\u20132973).","DOI":"10.1109\/ICCV.2015.339"},{"key":"1217_CR20","doi-asserted-by":"crossref","unstructured":"Ji, M., Gall, J., Zheng, H., Liu, Y., & Fang, L. (2017a). SurfaceNet: An end-to-end 3D neural network for multiview stereopsis. In IEEE international conference on computer vision (pp. 2326\u20132334).","DOI":"10.1109\/ICCV.2017.253"},{"key":"1217_CR21","doi-asserted-by":"crossref","unstructured":"Ji, P., Li, H., Dai, Y., & Reid, I. (2017b). \u201cMaximizing rigidity\u201d revisited: A convex programming approach for generic 3D shape reconstruction from multiple perspective views. IEEE international conference on computer vision (pp. 929\u2013937).","DOI":"10.1109\/ICCV.2017.106"},{"key":"1217_CR22","unstructured":"Kar, A., H\u00e4ne, C., & Malik, J. (2017). Learning a multi-view stereo machine. In International conference on neural information processing systems (pp. 364\u2013375)."},{"key":"1217_CR23","doi-asserted-by":"crossref","unstructured":"Kumar, S., Dai, Y., & Li, H. (2017). Monocular dense 3D reconstruction of a complex dynamic scene from two perspective frames. In IEEE international conference on computer vision (pp. 4649\u20134657).","DOI":"10.1109\/ICCV.2017.498"},{"key":"1217_CR24","unstructured":"Li, H., Xiong, P., An, J., & Wang, L. (2018). Pyramid attention network for semantic segmentation. \narXiv:1805.10180\n\n."},{"key":"1217_CR25","doi-asserted-by":"crossref","unstructured":"Lin, T. Y., & Maji, S. (2017). Improved bilinear pooling with CNNs. In British machine vision conference.","DOI":"10.5244\/C.31.117"},{"key":"1217_CR26","doi-asserted-by":"crossref","unstructured":"Lin, T. Y., Maji, S., & Koniusz, P. (2018). Second-order democratic aggregation. In European conference on computer vision (pp. 620\u2013636).","DOI":"10.1007\/978-3-030-01219-9_38"},{"key":"1217_CR27","doi-asserted-by":"crossref","unstructured":"Lin, T. Y., Roychowdhury, A., & Maji, S. (2015). Bilinear CNN models for fine-grained visual recognition. In IEEE international conference on computer vision (pp 1449\u20131457).","DOI":"10.1109\/ICCV.2015.170"},{"key":"1217_CR28","doi-asserted-by":"crossref","unstructured":"Liu, X., Kumar, B. V., Yang, C., Tang, Q., & You, J. (2018). Dependency-aware attention control for unconstrained face recognition with image sets. In European conference on computer vision (pp 548\u2013565).","DOI":"10.1007\/978-3-030-01252-6_34"},{"issue":"2","key":"1217_CR29","doi-asserted-by":"publisher","first-page":"91","DOI":"10.1023\/B:VISI.0000029664.99615.94","volume":"60","author":"DG Lowe","year":"2004","unstructured":"Lowe, D. G. (2004). Distinctive image features from scale-invariant keypoints. International Journal of Computer Vision, 60(2), 91\u2013110.","journal-title":"International Journal of Computer Vision"},{"key":"1217_CR30","unstructured":"Martin, E., & Cundy, C. (2018). Parallelizing linear recurrent neural nets over sequence length. In International conference on learning representations."},{"key":"1217_CR31","unstructured":"Nakka, K. K., & Salzmann, M. (2018). Deep attentional structured representation learning for visual recognition. In British machine vision conference."},{"key":"1217_CR32","doi-asserted-by":"publisher","first-page":"305","DOI":"10.1017\/S096249291700006X","volume":"26","author":"O Ozyesil","year":"2017","unstructured":"Ozyesil, O., Voroninski, V., Basri, R., & Singer, A. (2017). A survey of structure from motion. Acta Numerica, 26, 305\u2013364.","journal-title":"Acta Numerica"},{"key":"1217_CR33","doi-asserted-by":"crossref","unstructured":"Paschalidou, D., Ulusoy, A. O., Schmitt, C., Van Gool, L., & Geiger, A. (2018). RayNet: Learning volumetric 3D reconstruction with ray potentials. In IEEE conference on computer vision and pattern recognition (pp. 3897\u20133906).","DOI":"10.1109\/CVPR.2018.00410"},{"key":"1217_CR34","unstructured":"Qi, C. R., Su, H., Mo, K., & Guibas, L. J. (2017). PointNet: Deep learning on point sets for 3D classification and segmentation. In IEEE conference on computer vision and pattern recognition (pp. 652\u2013660)."},{"key":"1217_CR35","doi-asserted-by":"crossref","unstructured":"Qi, C. R., Su, H., Nie\u00dfner, M., Dai, A., Yan, M., Guibas, L. J. (2016). Volumetric and multi-view CNNs for object classification on 3D data. In IEEE conference on computer vision and pattern recognition (pp. 5648\u20135656).","DOI":"10.1109\/CVPR.2016.609"},{"key":"1217_CR36","unstructured":"Raffel, C., & Ellis, D. P. W. (2016). Feed-forward networks with attention can solve some long-term memory problems. In International conference on learning representations workshops."},{"key":"1217_CR37","doi-asserted-by":"crossref","unstructured":"Riegler, G., Ulusoy, A. O., Bischof, H., & Geiger, A. (2017). OctNetFusion: Learning depth fusion from data. In International conference on 3D vision (pp. 57\u201366).","DOI":"10.1109\/3DV.2017.00017"},{"key":"1217_CR38","doi-asserted-by":"crossref","unstructured":"Rodr\u00edguez, P., Gonfaus, J. M., Cucurull, G., Roca, F. X., & Gonz\u00e0lez, J. (2018). Attend and rectify: A gated attention mechanism for fine-grained recovery. In European conference on computer vision (pp. 349\u2013364).","DOI":"10.1007\/978-3-030-01237-3_22"},{"key":"1217_CR39","doi-asserted-by":"crossref","unstructured":"Sarafianos, N., Xu, X., & Kakadiaris, I. A. (2018). Deep imbalanced attribute classification using visual attention aggregation. European conference on computer vision (pp. 680\u2013697).","DOI":"10.1007\/978-3-030-01252-6_42"},{"key":"1217_CR40","doi-asserted-by":"crossref","unstructured":"Su, H., Maji, S., Kalogerakis, E., & Learned-Miller, E. (2015). Multi-view convolutional neural networks for 3D shape recognition. In IEEE international conference on computer vision (pp. 945\u2013953).","DOI":"10.1109\/ICCV.2015.114"},{"key":"1217_CR41","doi-asserted-by":"crossref","unstructured":"Tatarchenko, M., Dosovitskiy, A., & Brox, T. (2017). Octree generating networks: Efficient convolutional architectures for high-resolution 3D outputs. In IEEE international conference on computer vision (pp. 2088\u20132096).","DOI":"10.1109\/ICCV.2017.230"},{"key":"1217_CR42","unstructured":"Triggs, B., McLauchlan, P. F., Hartley, R. I., & Fitzgibbon, A. W. (1999). Bundle adjustment: A modern synthesis. In International workshop on vision algorithms."},{"key":"1217_CR43","unstructured":"Vaswani, A., Shazeer, N., Parmar, N., Uszkoreit, J., Jones, L., Gomez, A. N., et al. (2017). Attention is all you need. In International conference on neural information processing systems."},{"key":"1217_CR44","unstructured":"Vinyals, O., Bengio, S., & Kudlur, M. (2015). Order matters: Sequence to sequence for sets. In International conference on learning representations."},{"key":"1217_CR45","doi-asserted-by":"crossref","unstructured":"Wiles, O., & Zisserman, A. (2017). SilNet: Single- and multi-view reconstruction by learning from silhouettes. In British machine vision conference.","DOI":"10.5244\/C.31.99"},{"issue":"11-12","key":"1217_CR46","doi-asserted-by":"publisher","first-page":"1780","DOI":"10.1007\/s11263-018-1124-0","volume":"127","author":"Olivia Wiles","year":"2018","unstructured":"Wiles, O., & Zisserman, A. (2018). Learning to predict 3D surfaces of sculptures from single and multiple views. International Journal of Computer Vision. \nhttps:\/\/doi.org\/10.1007\/s11263-018-1124-0\n\n.","journal-title":"International Journal of Computer Vision"},{"key":"1217_CR47","unstructured":"Wu, Z., Song, S., Khosla, A., Yu, F., Zhang, L., Tang, X., & Xiao, J. (2015). 3D ShapeNets: A deep representation for volumetric shapes. In IEEE conference on computer vision and pattern recognition (pp. 1912\u20131920)."},{"key":"1217_CR48","unstructured":"Xu, K., Ba, J. L., Kiros, R., Cho, K., Courville, A., Salakhutdinov, R., Zemel, R. S., & Bengio, Y. (2015). Show, attend and tell: Neural image caption generation with visual attention. In International conference on machine learning (pp. 2048\u20132057)."},{"key":"1217_CR49","doi-asserted-by":"crossref","unstructured":"Yang, Z., He, X., Gao, J., Deng, L., & Smola, A. (2016). Stacked attention networks for image question answering. In IEEE conference on computer vision and pattern recognition (pp. 21\u201329).","DOI":"10.1109\/CVPR.2016.10"},{"key":"1217_CR50","doi-asserted-by":"crossref","unstructured":"Yang, X., Wang, Y., Wang, Y., Yin, B., Zhang, Q., Wei, X., & Fu, H. (2018). Active object reconstruction using a guided view planner. In International joint conference on artificial intelligence (pp. 4965\u20134971).","DOI":"10.24963\/ijcai.2018\/689"},{"key":"1217_CR51","doi-asserted-by":"crossref","unstructured":"Yao, Y., Luo, Z., Li, S., Fang, T., & Quan, L. (2018). MVSNet: Depth inference for unstructured multi-view stereo. In European conference on computer vision (pp. 767\u2013783).","DOI":"10.1007\/978-3-030-01237-3_47"},{"key":"1217_CR52","doi-asserted-by":"crossref","unstructured":"Yu, T., Meng, J., & Yuan, J. (2018). Multi-view harmonized bilinear network for 3D object recognition. In IEEE conference on computer vision and pattern recognition (pp. 186\u2013194).","DOI":"10.1109\/CVPR.2018.00027"},{"key":"1217_CR53","doi-asserted-by":"crossref","unstructured":"Yu, K., & Salzmann, M. (2018). Statistically motivated second order pooling. In European conference on computer vision (pp. 600\u2013616).","DOI":"10.1007\/978-3-030-01234-2_37"},{"key":"1217_CR54","unstructured":"Zaheer, M., Kottur, S., Ravanbakhsh, S., Poczos, B., Salakhutdinov, R., & Smola, A. (2017). Deep sets. In International conference on neural information processing systems."},{"key":"1217_CR55","unstructured":"Zhang, H., Goodfellow, I., Metaxas, D., & Odena, A. (2018). Self-attention generative adversarial networks. \narXiv:1805.08318\n\n."},{"key":"1217_CR56","doi-asserted-by":"crossref","unstructured":"Zhu, Y., Wang, J., Xie, L., & Zheng, L. (2018). Attention-based pyramid aggregation network for visual place recognition. In ACM international conference on multimedia.","DOI":"10.1145\/3240508.3240525"}],"container-title":["International Journal of Computer Vision"],"original-title":[],"language":"en","link":[{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s11263-019-01217-w.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/article\/10.1007\/s11263-019-01217-w\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s11263-019-01217-w.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2020,8,26]],"date-time":"2020-08-26T23:32:35Z","timestamp":1598484755000},"score":1,"resource":{"primary":{"URL":"http:\/\/link.springer.com\/10.1007\/s11263-019-01217-w"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2019,8,28]]},"references-count":56,"journal-issue":{"issue":"1","published-print":{"date-parts":[[2020,1]]}},"alternative-id":["1217"],"URL":"https:\/\/doi.org\/10.1007\/s11263-019-01217-w","relation":{},"ISSN":["0920-5691","1573-1405"],"issn-type":[{"value":"0920-5691","type":"print"},{"value":"1573-1405","type":"electronic"}],"subject":[],"published":{"date-parts":[[2019,8,28]]},"assertion":[{"value":"3 October 2018","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"16 August 2019","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"28 August 2019","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}}]}}