{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,1]],"date-time":"2026-05-01T00:11:59Z","timestamp":1777594319800,"version":"3.51.4"},"reference-count":51,"publisher":"Institute of Electronics, Information and Communications Engineers (IEICE)","issue":"7","content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEICE Trans. Inf. &amp; Syst."],"published-print":{"date-parts":[[2021,7,1]]},"DOI":"10.1587\/transinf.2020edp7235","type":"journal-article","created":{"date-parts":[[2021,6,30]],"date-time":"2021-06-30T22:20:59Z","timestamp":1625091659000},"page":"1039-1048","source":"Crossref","is-referenced-by-count":4,"title":["Attention Voting Network with Prior Distance Augmented Loss for 6DoF Pose Estimation"],"prefix":"10.1587","volume":"E104.D","author":[{"given":"Yong","family":"HE","sequence":"first","affiliation":[{"name":"School of Computer Science, Chongqing University"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Ji","family":"LI","sequence":"additional","affiliation":[{"name":"School of Computer Science, Chongqing University"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Xuanhong","family":"ZHOU","sequence":"additional","affiliation":[{"name":"School of Computer Science, Chongqing University"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Zewei","family":"CHEN","sequence":"additional","affiliation":[{"name":"School of Computer Science, Chongqing University"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Xin","family":"LIU","sequence":"additional","affiliation":[{"name":"School of Computer Science, Chongqing University"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"532","reference":[{"key":"1","doi-asserted-by":"crossref","unstructured":"[1] D. Xu, D. Anguelov, and A. Jain, \u201cPointfusion: Deep sensor fusion for 3d bounding box estimation,\u201d Proc. IEEE Conference on Computer Vision and Pattern Recognition, pp.244-253, 2018. 10.1109\/cvpr.2018.00033","DOI":"10.1109\/CVPR.2018.00033"},{"key":"2","doi-asserted-by":"crossref","unstructured":"[2] A. Geiger, P. Lenz, and R. Urtasun, \u201cAre we ready for autonomous driving? the kitti vision benchmark suite,\u201d 2012 IEEE Conference on Computer Vision and Pattern Recognition, pp.3354-3361, IEEE, 2012. 10.1109\/cvpr.2012.6248074","DOI":"10.1109\/CVPR.2012.6248074"},{"key":"3","unstructured":"[3] J. Tremblay, T. To, B. Sundaralingam, Y. Xiang, D. Fox, and S. Birchfield, \u201cDeep object pose estimation for semantic robotic grasping of household objects,\u201d arXiv preprint arXiv:1809.10790, 2018."},{"key":"4","doi-asserted-by":"crossref","unstructured":"[4] M. Zhu, K.G. Derpanis, Y. Yang, S. Brahmbhatt, M. Zhang, C. Phillips, M. Lecce, and K. Daniilidis, \u201cSingle image 3d object detection and pose estimation for grasping,\u201d 2014 IEEE International Conference on Robotics and Automation (ICRA), pp.3936-3943, IEEE, 2014. 10.1109\/icra.2014.6907430","DOI":"10.1109\/ICRA.2014.6907430"},{"key":"5","doi-asserted-by":"publisher","unstructured":"[5] E. Marchand, H. Uchiyama, and F. Spindler, \u201cPose estimation for augmented reality: a hands-on survey,\u201d IEEE Trans. Vis. Comput. Graphics, vol.22, no.12, pp.2633-2651, 2015. 10.1109\/tvcg.2015.2513408","DOI":"10.1109\/TVCG.2015.2513408"},{"key":"6","doi-asserted-by":"crossref","unstructured":"[6] Y. Xiang, T. Schmidt, V. Narayanan, and D. Fox, \u201cPosecnn: A convolutional neural network for 6d object pose estimation in cluttered scenes,\u201d arXiv preprint arXiv:1711.00199, 2017.","DOI":"10.15607\/RSS.2018.XIV.019"},{"key":"7","doi-asserted-by":"crossref","unstructured":"[7] S. Peng, Y. Liu, Q. Huang, X. Zhou, and H. Bao, \u201cPvnet: Pixel-wise voting network for 6dof pose estimation,\u201d Proc. IEEE Conference on Computer Vision and Pattern Recognition, pp.4561-4570, 2019. 10.1109\/cvpr.2019.00469","DOI":"10.1109\/CVPR.2019.00469"},{"key":"8","doi-asserted-by":"crossref","unstructured":"[8] S. Zakharov, I. Shugurov, and S. Ilic, \u201cDpod: 6d pose object detector and refiner,\u201d Proc. IEEE International Conference on Computer Vision, pp.1941-1950, 2019. 10.1109\/iccv.2019.00203","DOI":"10.1109\/ICCV.2019.00203"},{"key":"9","doi-asserted-by":"crossref","unstructured":"[9] Y. Hu, J. Hugonot, P. Fua, and M. Salzmann, \u201cSegmentation-driven 6d object pose estimation,\u201d Proc. IEEE Conference on Computer Vision and Pattern Recognition, pp.3385-3394, 2019. 10.1109\/cvpr.2019.00350","DOI":"10.1109\/CVPR.2019.00350"},{"key":"10","doi-asserted-by":"crossref","unstructured":"[10] Z. Li, G. Wang, and X. Ji, \u201cCdpn: Coordinates-based disentangled pose network for real-time rgb-based 6-dof object pose estimation,\u201d Proc. IEEE International Conference on Computer Vision, pp.7678-7687, 2019. 10.1109\/iccv.2019.00777","DOI":"10.1109\/ICCV.2019.00777"},{"key":"11","doi-asserted-by":"crossref","unstructured":"[11] C. Li, J. Bai, and G.D. Hager, \u201cA unified framework for multi-view multi-class object pose estimation,\u201d Proc. European Conference on Computer Vision (ECCV), pp.254-269, 2018. 10.1007\/978-3-030-01270-0_16","DOI":"10.1007\/978-3-030-01270-0_16"},{"key":"12","doi-asserted-by":"crossref","unstructured":"[12] H. Wang, S. Sridhar, J. Huang, J. Valentin, S. Song, and L.J. Guibas, \u201cNormalized object coordinate space for category-level 6d object pose and size estimation,\u201d Proc. IEEE Conference on Computer Vision and Pattern Recognition, pp.2642-2651, 2019. 10.1109\/cvpr.2019.00275","DOI":"10.1109\/CVPR.2019.00275"},{"key":"13","doi-asserted-by":"crossref","unstructured":"[13] C. Wang, D. Xu, Y. Zhu, R. Mart\u00edn-Mart\u00edn, C. Lu, L. Fei-Fei, and S. Savarese, \u201cDensefusion: 6d object pose estimation by iterative dense fusion,\u201d Proc. IEEE Conference on Computer Vision and Pattern Recognition, pp.3343-3352, 2019. 10.1109\/cvpr.2019.00346","DOI":"10.1109\/CVPR.2019.00346"},{"key":"14","doi-asserted-by":"crossref","unstructured":"[14] Y. He, W. Sun, H. Huang, J. Liu, H. Fan, and J. Sun, \u201cPvn3d: A deep point-wise 3d keypoints voting network for 6dof pose estimation,\u201d Proc. IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp.11632-11641, 2020. 10.1109\/cvpr42600.2020.01165","DOI":"10.1109\/CVPR42600.2020.01165"},{"key":"15","doi-asserted-by":"crossref","unstructured":"[15] M. Rad and V. Lepetit, \u201cBb8: A scalable, accurate, robust to partial occlusion method for predicting the 3d poses of challenging objects without using depth,\u201d Proc. IEEE International Conference on Computer Vision, pp.3828-3836, 2017. 10.1109\/iccv.2017.413","DOI":"10.1109\/ICCV.2017.413"},{"key":"16","doi-asserted-by":"crossref","unstructured":"[16] W. Kehl, F. Manhardt, F. Tombari, S. Ilic, and N. Navab, \u201cSsd-6d: Making rgb-based 3d detection and 6d pose estimation great again,\u201d Proc. IEEE International Conference on Computer Vision, pp.1521-1529, 2017. 10.1109\/iccv.2017.169","DOI":"10.1109\/ICCV.2017.169"},{"key":"17","doi-asserted-by":"crossref","unstructured":"[17] C. Capellen, M. Schwarz, and S. Behnke, \u201cConvposecnn: Dense convolutional 6d object pose estimation,\u201d arXiv preprint arXiv:1912.07333, 2019.","DOI":"10.5220\/0008990901620172"},{"key":"18","doi-asserted-by":"crossref","unstructured":"[18] P. Besl and H. McKay, \u201cA method for registration of 3-d shapes,\u201d IEEE Trans. Pattern Anal. Mach. Intell., vol.14, pp.239-256, 03 1992.","DOI":"10.1109\/34.121791"},{"key":"19","doi-asserted-by":"crossref","unstructured":"[19] B. Tekin, S.N. Sinha, and P. Fua, \u201cReal-time seamless single shot 6d object pose prediction,\u201d Proc. IEEE Conference on Computer Vision and Pattern Recognition, pp.292-301, 2018. 10.1109\/cvpr.2018.00038","DOI":"10.1109\/CVPR.2018.00038"},{"key":"20","doi-asserted-by":"crossref","unstructured":"[20] M. Oberweger, M. Rad, and V. Lepetit, \u201cMaking deep heatmaps robust to partial occlusions for 3d object pose estimation,\u201d Proc. European Conference on Computer Vision (ECCV), pp.119-134, 2018. 10.1007\/978-3-030-01267-0_8","DOI":"10.1007\/978-3-030-01267-0_8"},{"key":"21","doi-asserted-by":"crossref","unstructured":"[21] O.H. Jafari, S.K. Mustikovela, K. Pertsch, E. Brachmann, and C. Rother, \u201cipose: instance-aware 6d pose estimation of partly occluded objects,\u201d Asian Conference on Computer Vision, pp.477-492, Springer, 2018. 10.1007\/978-3-030-20893-6_30","DOI":"10.1007\/978-3-030-20893-6_30"},{"key":"22","doi-asserted-by":"crossref","unstructured":"[22] E. Brachmann, F. Michel, A. Krull, M. Ying Yang, S. Gumhold, and C. Rother, \u201cUncertainty-driven 6d pose estimation of objects and scenes from a single rgb image,\u201d Proc. IEEE conference on computer vision and pattern recognition, pp.3364-3372, 2016. 10.1109\/cvpr.2016.366","DOI":"10.1109\/CVPR.2016.366"},{"key":"23","doi-asserted-by":"crossref","unstructured":"[23] K. Park, T. Patten, and M. Vincze, \u201cPix2pose: Pixel-wise coordinate regression of objects for 6d pose estimation,\u201d Proc. IEEE International Conference on Computer Vision, pp.7668-7677, 2019. 10.1109\/iccv.2019.00776","DOI":"10.1109\/ICCV.2019.00776"},{"key":"24","doi-asserted-by":"crossref","unstructured":"[24] R. Girshick, \u201cFast r-cnn,\u201d Proc. IEEE international conference on computer vision, pp.1440-1448, 2015. 10.1109\/iccv.2015.169","DOI":"10.1109\/ICCV.2015.169"},{"key":"25","doi-asserted-by":"crossref","unstructured":"[25] O. Ronneberger, P. Fischer, and T. Brox, \u201cU-net: Convolutional networks for biomedical image segmentation,\u201d International Conference on Medical image computing and computer-assisted intervention, pp.234-241, Springer, 2015. 10.1007\/978-3-319-24574-4_28","DOI":"10.1007\/978-3-319-24574-4_28"},{"key":"26","doi-asserted-by":"publisher","unstructured":"[26] X. Qin, Z. Zhang, C. Huang, M. Dehghan, O.R. Zaiane, and M. Jagersand, \u201cU2-net: Going deeper with nested u-structure for salient object detection,\u201d Pattern Recognition, vol.106, p.107404, 2020. 10.1016\/j.patcog.2020.107404","DOI":"10.1016\/j.patcog.2020.107404"},{"key":"27","doi-asserted-by":"crossref","unstructured":"[27] Z. Zhou, M.M.R. Siddiquee, N. Tajbakhsh, and J. Liang, \u201cUnet++: A nested u-net architecture for medical image segmentation,\u201d Deep Learning in Medical Image Analysis and Multimodal Learning for Clinical Decision Support, pp.3-11, Springer, 2018. 10.1007\/978-3-030-00889-5_1","DOI":"10.1007\/978-3-030-00889-5_1"},{"key":"28","doi-asserted-by":"publisher","unstructured":"[28] Z. Gu, J. Cheng, H. Fu, K. Zhou, H. Hao, Y. Zhao, T. Zhang, S. Gao, and J. Liu, \u201cCe-net: Context encoder network for 2d medical image segmentation,\u201d IEEE Trans. Med. Imag., vol.38, no.10, pp.2281-2292, 2019. 10.1109\/tmi.2019.2903562","DOI":"10.1109\/TMI.2019.2903562"},{"key":"29","unstructured":"[29] O. Oktay, J. Schlemper, L.L. Folgoc, M. Lee, M. Heinrich, K.Misawa, K. Mori, S. McDonagh, N.Y. Hammerla, B. Kainz, et al., \u201cAttention u-net: Learning where to look for the pancreas,\u201d arXiv preprint arXiv:1804.03999, 2018."},{"key":"30","doi-asserted-by":"crossref","unstructured":"[30] J. Hu, L. Shen, and G. Sun, \u201cSqueeze-and-excitation networks,\u201d Proc. IEEE conference on computer vision and pattern recognition, pp.7132-7141, 2018. 10.1109\/cvpr.2018.00745","DOI":"10.1109\/CVPR.2018.00745"},{"key":"31","unstructured":"[31] H. Li, P. Xiong, J. An, and L. Wang, \u201cPyramid attention network for semantic segmentation,\u201d arXiv preprint arXiv:1805.10180, 2018."},{"key":"32","doi-asserted-by":"crossref","unstructured":"[32] Q. Wang, B. Wu, P. Zhu, P. Li, W. Zuo, and Q. Hu, \u201cEca-net: Efficient channel attention for deep convolutional neural networks,\u201d Proc. IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp.11534-11542, 2020. 10.1109\/cvpr42600.2020.01155","DOI":"10.1109\/CVPR42600.2020.01155"},{"key":"33","doi-asserted-by":"crossref","unstructured":"[33] Z.-L. Ni, G.-B. Bian, X.-H. Zhou, Z.-G. Hou, X.-L. Xie, C. Wang, Y.-J. Zhou, R.-Q. Li, and Z. Li, \u201cRaunet: Residual attention u-net for semantic segmentation of cataract surgical instruments,\u201d International Conference on Neural Information Processing, pp.139-149, Springer, 2019. 10.1007\/978-3-030-36711-4_13","DOI":"10.1007\/978-3-030-36711-4_13"},{"key":"34","doi-asserted-by":"crossref","unstructured":"[34] S. Hinterstoisser, V. Lepetit, S. Ilic, S. Holzer, G. Bradski, K.Konolige, and N. Navab, \u201cModel based training, detection and pose estimation of texture-less 3d objects in heavily cluttered scenes,\u201d Asian conference on computer vision, pp.548-562, Springer, 2012. 10.1007\/978-3-642-37331-2_42","DOI":"10.1007\/978-3-642-37331-2_42"},{"key":"35","doi-asserted-by":"crossref","unstructured":"[35] E. Brachmann, A. Krull, F. Michel, S. Gumhold, J. Shotton, and C. Rother, \u201cLearning 6d object pose estimation using 3d object coordinates,\u201d European conference on computer vision, pp.536-551, Springer, 2014. 10.1007\/978-3-319-10605-2_35","DOI":"10.1007\/978-3-319-10605-2_35"},{"key":"36","doi-asserted-by":"crossref","unstructured":"[36] C. Gu and X. Ren, \u201cDiscriminative mixture-of-templates for viewpoint classification,\u201d European Conference on Computer Vision, pp.408-421, Springer, 2010. 10.1007\/978-3-642-15555-0_30","DOI":"10.1007\/978-3-642-15555-0_30"},{"key":"37","doi-asserted-by":"publisher","unstructured":"[37] S. Hinterstoisser, C. Cagniart, S. Ilic, P. Sturm, N. Navab, P. Fua, and V. Lepetit, \u201cGradient response maps for real-time detection of textureless objects,\u201d IEEE Trans. Pattern Anal. Mach. Intell., vol.34, no.5, pp.876-888, 2011. 10.1109\/tpami.2011.206","DOI":"10.1109\/TPAMI.2011.206"},{"key":"38","doi-asserted-by":"crossref","unstructured":"[38] A. Kendall, M. Grimes, and R. Cipolla, \u201cPosenet: A convolutional network for real-time 6-dof camera relocalization,\u201d Proc. IEEE international conference on computer vision, pp.2938-2946, 2015. 10.1109\/iccv.2015.336","DOI":"10.1109\/ICCV.2015.336"},{"key":"39","doi-asserted-by":"publisher","unstructured":"[39] V. Lepetit, F. Moreno-Noguer, and P. Fua, \u201cEpnp: An accurate o (n) solution to the pnp problem,\u201d International journal of computer vision, vol.81, no.2, p.155, 2009. 10.1007\/s11263-008-0152-6","DOI":"10.1007\/s11263-008-0152-6"},{"key":"40","doi-asserted-by":"crossref","unstructured":"[40] M. Aubry, D. Maturana, A.A. Efros, B.C. Russell, and J. Sivic, \u201cSeeing 3d chairs: exemplar part-based 2d-3d alignment using a large dataset of cad models,\u201d Proc. IEEE conference on computer vision and pattern recognition, pp.3762-3769, 2014. 10.1109\/cvpr.2014.487","DOI":"10.1109\/CVPR.2014.487"},{"key":"41","doi-asserted-by":"crossref","unstructured":"[41] J. Redmon, S. Divvala, R. Girshick, and A. Farhadi, \u201cYou only look once: Unified, real-time object detection,\u201d Proc. IEEE conference on computer vision and pattern recognition, pp.779-788, 2016. 10.1109\/cvpr.2016.91","DOI":"10.1109\/CVPR.2016.91"},{"key":"42","doi-asserted-by":"crossref","unstructured":"[42] K. He, G. Gkioxari, P. Doll\u00e1r, and R. Girshick, \u201cMask r-cnn,\u201d Proc. IEEE international conference on computer vision, pp.2961-2969, 2017. 10.1109\/iccv.2017.322","DOI":"10.1109\/ICCV.2017.322"},{"key":"43","doi-asserted-by":"crossref","unstructured":"[43] H. Zhang, K. Dana, J. Shi, Z. Zhang, X. Wang, A. Tyagi, and A. Agrawal, \u201cContext encoding for semantic segmentation,\u201d Proc. IEEE conference on Computer Vision and Pattern Recognition, pp.7151-7160, 2018. 10.1109\/cvpr.2018.00747","DOI":"10.1109\/CVPR.2018.00747"},{"key":"44","doi-asserted-by":"crossref","unstructured":"[44] C. Yu, J. Wang, C. Peng, C. Gao, G. Yu, and N. Sang, \u201cBisenet: Bilateral segmentation network for real-time semantic segmentation,\u201d Proc. European conference on computer vision (ECCV), pp.325-341, 2018. 10.1007\/978-3-030-01261-8_20","DOI":"10.1007\/978-3-030-01261-8_20"},{"key":"45","doi-asserted-by":"crossref","unstructured":"[45] C. Yu, J. Wang, C. Peng, C. Gao, G. Yu, and N. Sang, \u201cLearning a discriminative feature network for semantic segmentation,\u201d Proc. IEEE conference on computer vision and pattern recognition, pp.1857-1866, 2018. 10.1109\/cvpr.2018.00199","DOI":"10.1109\/CVPR.2018.00199"},{"key":"46","doi-asserted-by":"crossref","unstructured":"[46] L.-C. Chen, Y. Yang, J. Wang, W. Xu, and A.L. Yuille, \u201cAttention to scale: Scale-aware semantic image segmentation,\u201d Proc. IEEE conference on computer vision and pattern recognition, pp.3640-3649, 2016. 10.1109\/cvpr.2016.396","DOI":"10.1109\/CVPR.2016.396"},{"key":"47","doi-asserted-by":"crossref","unstructured":"[47] H. Zhao, Y. Zhang, S. Liu, J. Shi, C. Change Loy, D. Lin, and J. Jia, \u201cPsanet: Point-wise spatial attention network for scene parsing,\u201d Proc. European Conference on Computer Vision (ECCV), pp.267-283, 2018. 10.1007\/978-3-030-01240-3_17","DOI":"10.1007\/978-3-030-01240-3_17"},{"key":"48","doi-asserted-by":"crossref","unstructured":"[48] C. Yu, J. Wang, C. Gao, G. Yu, C. Shen, and N. Sang, \u201cContext prior for scene segmentation,\u201d Proc. IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp.12416-12425, 2020. 10.1109\/cvpr42600.2020.01243","DOI":"10.1109\/CVPR42600.2020.01243"},{"key":"49","doi-asserted-by":"crossref","unstructured":"[49] X. Li, Z. Zhong, J. Wu, Y. Yang, Z. Lin, and H. Liu, \u201cExpectation-maximization attention networks for semantic segmentation,\u201d Proc. IEEE International Conference on Computer Vision, pp.9167-9176, 2019. 10.1109\/iccv.2019.00926","DOI":"10.1109\/ICCV.2019.00926"},{"key":"50","unstructured":"[50] C.R. Qi, L. Yi, H. Su, and L.J. Guibas, \u201cPointnet++: Deep hierarchical feature learning on point sets in a metric space,\u201d Advances in neural information processing systems, pp.5099-5108, 2017."},{"key":"51","doi-asserted-by":"crossref","unstructured":"[51] K. He, X. Zhang, S. Ren, and J. Sun, \u201cDeep residual learning for image recognition,\u201d Proc. IEEE conference on computer vision and pattern recognition, pp.770-778, 2016. 10.1109\/cvpr.2016.90","DOI":"10.1109\/CVPR.2016.90"}],"container-title":["IEICE Transactions on Information and Systems"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/www.jstage.jst.go.jp\/article\/transinf\/E104.D\/7\/E104.D_2020EDP7235\/_pdf","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,9,3]],"date-time":"2024-09-03T01:41:17Z","timestamp":1725327677000},"score":1,"resource":{"primary":{"URL":"https:\/\/www.jstage.jst.go.jp\/article\/transinf\/E104.D\/7\/E104.D_2020EDP7235\/_article"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2021,7,1]]},"references-count":51,"journal-issue":{"issue":"7","published-print":{"date-parts":[[2021]]}},"URL":"https:\/\/doi.org\/10.1587\/transinf.2020edp7235","relation":{},"ISSN":["0916-8532","1745-1361"],"issn-type":[{"value":"0916-8532","type":"print"},{"value":"1745-1361","type":"electronic"}],"subject":[],"published":{"date-parts":[[2021,7,1]]},"article-number":"2020EDP7235"}}