{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,3]],"date-time":"2026-03-03T22:19:40Z","timestamp":1772576380046,"version":"3.50.1"},"reference-count":52,"publisher":"Springer Science and Business Media LLC","issue":"3","license":[{"start":{"date-parts":[[2019,10,23]],"date-time":"2019-10-23T00:00:00Z","timestamp":1571788800000},"content-version":"tdm","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2019,10,23]],"date-time":"2019-10-23T00:00:00Z","timestamp":1571788800000},"content-version":"vor","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"}],"funder":[{"DOI":"10.13039\/501100002946","name":"Deutsches Zentrum f\u00fcr Luft- und Raumfahrt","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100002946","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Int J Comput Vis"],"published-print":{"date-parts":[[2020,3]]},"DOI":"10.1007\/s11263-019-01243-8","type":"journal-article","created":{"date-parts":[[2019,10,23]],"date-time":"2019-10-23T16:32:44Z","timestamp":1571848364000},"page":"714-729","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":121,"title":["Augmented Autoencoders: Implicit 3D Orientation Learning for 6D Object Detection"],"prefix":"10.1007","volume":"128","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-0587-9643","authenticated-orcid":false,"given":"Martin","family":"Sundermeyer","sequence":"first","affiliation":[]},{"given":"Zoltan-Csaba","family":"Marton","sequence":"additional","affiliation":[]},{"given":"Maximilian","family":"Durner","sequence":"additional","affiliation":[]},{"given":"Rudolph","family":"Triebel","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2019,10,23]]},"reference":[{"key":"1243_CR1","unstructured":"Balntas, V., Doumanoglou, A., Sahin, C., Sock, J., Kouskouridas, R., & Kim, T. K. (2017). Pose guided RGB-D feature learning for 3D object pose estimation. In Proceedings of the IEEE conference on computer vision and pattern recognition (pp. 3856\u20133864)."},{"key":"1243_CR2","doi-asserted-by":"crossref","unstructured":"Bousmalis, K., Irpan, A., Wohlhart, P., Bai, Y., Kelcey, M., Kalakrishnan, M., Downs, L., Ibarz, J., Pastor, P., Konolige, K., et\u00a0al. (2017a). Using simulation and domain adaptation to improve efficiency of deep robotic grasping. arXiv preprint \narXiv:170907857\n\n.","DOI":"10.1109\/ICRA.2018.8460875"},{"key":"1243_CR3","doi-asserted-by":"crossref","unstructured":"Bousmalis, K., Silberman, N., Dohan, D., Erhan, D., & Krishnan, D. (2017b). Unsupervised pixel-level domain adaptation with generative adversarial networks. In The IEEE conference on computer vision and pattern recognition (CVPR) (Vol. 1, p. 7).","DOI":"10.1109\/CVPR.2017.18"},{"key":"1243_CR4","doi-asserted-by":"crossref","unstructured":"Brachmann, E., Michel, F., Krull, A., Ying\u00a0Yang, M., Gumhold, S., et\u00a0al. (2016). Uncertainty-driven 6D pose estimation of objects and scenes from a single RGB image. In Proceedings of the IEEE conference on computer vision and pattern recognition (pp. 3364\u20133372).","DOI":"10.1109\/CVPR.2016.366"},{"issue":"3","key":"1243_CR5","doi-asserted-by":"publisher","first-page":"145","DOI":"10.1016\/0262-8856(92)90066-C","volume":"10","author":"Y Chen","year":"1992","unstructured":"Chen, Y., & Medioni, G. (1992). Object modelling by registration of multiple range images. Image and Vision Computing, 10(3), 145\u2013155.","journal-title":"Image and Vision Computing"},{"key":"1243_CR6","doi-asserted-by":"crossref","unstructured":"Csurka, G. (2017). Domain adaptation for visual applications: A comprehensive survey. arXiv preprint \narXiv:170205374\n\n.","DOI":"10.1007\/978-3-319-58347-1"},{"key":"1243_CR7","doi-asserted-by":"crossref","unstructured":"Drost, B., Ulrich, M., Navab, N., & Ilic, S. (2010). Model globally, match locally: Efficient and robust 3D object recognition. In 2010 IEEE computer society conference on computer vision and pattern recognition, IEEE (pp. 998\u20131005).","DOI":"10.1109\/CVPR.2010.5540108"},{"key":"1243_CR8","unstructured":"Everingham, M., Van\u00a0Gool, L., Williams, C. K. I., Winn, J., & Zisserman, A. (2012). The PASCAL visual object classes challenge 2012 (VOC2012) results. \nhttp:\/\/host.robots.ox.ac.uk\/pascal\/VOC\/voc2012\/results\/index.html\n\n."},{"key":"1243_CR9","unstructured":"Glorot, X., & Bengio, Y. (2010). Understanding the difficulty of training deep feedforward neural networks. In Proceedings of the thirteenth international conference on artificial intelligence and statistics (pp. 249\u2013256)."},{"key":"1243_CR10","doi-asserted-by":"crossref","unstructured":"Hinterstoisser, S., Benhimane, S., Lepetit, V., Fua, P., & Navab, N. (2008). Simultaneous recognition and homography extraction of local patches with a simple linear classifier. In Proceedings of the British machine conference (pp. 1\u201310).","DOI":"10.5244\/C.22.10"},{"key":"1243_CR11","doi-asserted-by":"crossref","unstructured":"Hinterstoisser, S., Holzer, S., Cagniart, C., Ilic, S., Konolige, K., Navab, N., & Lepetit, V. (2011). Multimodal templates for real-time detection of texture-less objects in heavily cluttered scenes. In 2011 IEEE international conference on computer vision (ICCV), IEEE (pp. 858\u2013865).","DOI":"10.1109\/ICCV.2011.6126326"},{"issue":"5","key":"1243_CR12","doi-asserted-by":"publisher","first-page":"876","DOI":"10.1109\/TPAMI.2011.206","volume":"34","author":"S Hinterstoisser","year":"2012","unstructured":"Hinterstoisser, S., Cagniart, C., Ilic, S., Sturm, P., Navab, N., Fua, P., et al. (2012a). Gradient response maps for real-time detection of textureless objects. IEEE Transactions on Pattern Analysis and Machine Intelligence, 34(5), 876\u2013888.","journal-title":"IEEE Transactions on Pattern Analysis and Machine Intelligence"},{"key":"1243_CR13","doi-asserted-by":"crossref","unstructured":"Hinterstoisser, S., Lepetit, V., Ilic, S., Holzer, S., Bradski, G., Konolige, K., & Navab, N. (2012b) Model based training, detection and pose estimation of texture-less 3D objects in heavily cluttered scenes. In Asian conference on computer vision, Springer (pp 548\u2013562)","DOI":"10.1007\/978-3-642-37331-2_42"},{"key":"1243_CR14","doi-asserted-by":"crossref","unstructured":"Hinterstoisser, S., Lepetit, V., Rajkumar, N., & Konolige, K. (2016) Going further with point pair features. In European conference on computer vision, Springer (pp. 834\u2013848)","DOI":"10.1007\/978-3-319-46487-9_51"},{"key":"1243_CR15","unstructured":"Hinterstoisser, S., Lepetit, V., Wohlhart, P., & Konolige, K. (2017) On pre-trained image features and synthetic images for deep learning. arXiv preprint \narXiv:171010710\n\n."},{"key":"1243_CR16","unstructured":"Hodan, T. (2017). SIXD Challenge 2017. \nhttp:\/\/cmp.felk.cvut.cz\/sixd\/challenge_2017\/\n\n. Accessed 7 Oct 2019."},{"key":"1243_CR17","doi-asserted-by":"crossref","unstructured":"Hoda\u0148, T., Matas, J., & Obdr\u017e\u00e1lek, \u0160. (2016). On evaluation of 6D object pose estimation. In European conference on computer vision, Springer (pp. 606\u2013619).","DOI":"10.1007\/978-3-319-49409-8_52"},{"key":"1243_CR18","doi-asserted-by":"crossref","unstructured":"Hoda\u0148, T., Haluza, P., Obdr\u017e\u00e1lek, \u0160., Matas, J., Lourakis, M., & Zabulis, X. (2017). T-LESS: An RGB-D dataset for 6D pose estimation of texture-less objects. In IEEE winter conference on applications of computer vision (WACV).","DOI":"10.1109\/WACV.2017.103"},{"key":"1243_CR19","doi-asserted-by":"publisher","first-page":"19","DOI":"10.1007\/978-3-030-01249-6_2","volume-title":"Computer Vision \u2013 ECCV 2018","author":"Tom\u00e1\u0161 Hoda\u0148","year":"2018","unstructured":"Hodan, T., Michel, F., Brachmann, E., Kehl, W., GlentBuch, A., Kraft, D., Drost, B., Vidal, J., Ihrke, S., Zabulis, X., et\u00a0al. (2018) Bop: Benchmark for 6D object pose estimation. In Proceedings of the European conference on computer vision (ECCV) (pp. 19\u201334)."},{"key":"1243_CR20","doi-asserted-by":"crossref","unstructured":"Hodan, T., Vineet, V., Gal, R., Shalev, E., Hanzelka, J., Connell, T., Urbina, P., Sinha, S. N., & Guenter, B. K. (2019) Photorealistic image synthesis for object instance detection. \narXiv:1902.03334\n\n.","DOI":"10.1109\/ICIP.2019.8803821"},{"key":"1243_CR21","unstructured":"Howard, A. G., Zhu, M., Chen, B., Kalenichenko, D., Wang, W., Weyand, T., Andreetto, M., & Adam, H. (2017). Mobilenets: Efficient convolutional neural networks for mobile vision applications. arXiv preprint \narXiv:170404861\n\n."},{"key":"1243_CR22","doi-asserted-by":"crossref","unstructured":"Kehl, W., Milletari, F., Tombari, F., Ilic, S., & Navab, N. (2016). Deep learning of local RGB-D patches for 3D object detection and 6D pose estimation. In European conference on computer vision, Springer (pp. 205\u2013220).","DOI":"10.1007\/978-3-319-46487-9_13"},{"key":"1243_CR23","doi-asserted-by":"crossref","unstructured":"Kehl, W., Manhardt, F., Tombari, F., Ilic, S., & Navab, N. (2017) SSD-6D: Making RGB-based 3D detection and 6D pose estimation great again. In Proceedings of the IEEE conference on computer vision and pattern recognition (pp. 1521\u20131529)","DOI":"10.1109\/ICCV.2017.169"},{"key":"1243_CR24","unstructured":"Kingma, D., & Ba, J. (2014) Adam: A method for stochastic optimization. arXiv preprint \narXiv:14126980\n\n."},{"key":"1243_CR25","doi-asserted-by":"crossref","unstructured":"Lin, T. Y., Maire, M., Belongie, S., Hays, J., Perona, P., Ramanan, D., Doll\u00e1r, P., & Zitnick, C. L. (2014) Microsoft coco: Common objects in context. In European conference on computer vision, Springer (pp. 740\u2013755).","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"1243_CR26","unstructured":"Lin, T. Y., Goyal, P., Girshick, R., He, K., & Doll\u00e1r, P. (2017). Focal loss for dense object detection. In: Proceedings of the IEEE international conference on computer vision (pp. 2980\u20132988)."},{"key":"1243_CR27","doi-asserted-by":"crossref","unstructured":"Liu, W., Anguelov, D., Erhan, D., Szegedy, C., Reed, S., Fu, C. Y., & Berg, A. C. (2016) SSD: Single shot multibox detector. In European conference on computer vision, Springer (pp. 21\u201337).","DOI":"10.1007\/978-3-319-46448-0_2"},{"key":"1243_CR28","unstructured":"Mahendran, S., Ali, H., & Vidal, R. (2017). 3D pose regression using convolutional neural networks. arXiv preprint \narXiv:170805628\n\n."},{"key":"1243_CR29","doi-asserted-by":"crossref","unstructured":"Manhardt, F., Kehl, W., Navab, N., & Tombari, F. (2018). Deep model-based 6D pose refinement in RGB. In The European conference on computer vision (ECCV)","DOI":"10.1007\/978-3-030-01264-9_49"},{"key":"1243_CR30","unstructured":"Matthey, L., Higgins, I., Hassabis, D., & Lerchner, A. (2017). dsprites: Disentanglement testing Sprites dataset. \nhttps:\/\/github.com\/deepmind\/dsprites-dataset\/\n\n."},{"key":"1243_CR31","doi-asserted-by":"crossref","unstructured":"Mitash, C., Bekris, K. E., & Boularias, A. (2017). A self-supervised learning system for object detection using physics simulation and multi-view pose estimation. In 2017 IEEE\/RSJ international conference on intelligent robots and systems (IROS), IEEE (pp. 545\u2013551).","DOI":"10.1109\/IROS.2017.8202206"},{"key":"1243_CR32","unstructured":"Movshovitz-Attias, Y., Kanade, T., & Sheikh, Y. (2016). How useful is photo-realistic rendering for visual learning? In European conference on computer vision, Springer (pp. 202\u2013217)."},{"issue":"6","key":"1243_CR33","doi-asserted-by":"publisher","first-page":"311","DOI":"10.1145\/360825.360839","volume":"18","author":"BT Phong","year":"1975","unstructured":"Phong, B. T. (1975). Illumination for computer generated pictures. Communications of the ACM, 18(6), 311\u2013317.","journal-title":"Communications of the ACM"},{"key":"1243_CR34","unstructured":"Rad, M., & Lepetit, V. (2017). BB8: A scalable, accurate, robust to partial occlusion method for predicting the 3D poses of challenging objects without using depth. arXiv preprint \narXiv:170310896\n\n."},{"key":"1243_CR35","unstructured":"Ren, S., He, K., Girshick, R., & Sun, J. (2015). Faster R-CNN: Towards real-time object detection with region proposal networks. In Advances in neural information processing systems (pp. 91\u201399)."},{"key":"1243_CR36","doi-asserted-by":"crossref","unstructured":"Richter, S. R., Vineet, V., Roth, S., & Koltun, V. (2016). Playing for data: Ground truth from computer games. In European conference on computer vision, Springer (pp. 102\u2013118).","DOI":"10.1007\/978-3-319-46475-6_7"},{"key":"1243_CR37","doi-asserted-by":"crossref","unstructured":"Rumelhart, D. E., Hinton, G. E., & Williams, R. J. (1985). Learning internal representations by error propagation. Technical report, California University, San Diego, La Jolla, Institute for Cognitive Science.","DOI":"10.21236\/ADA164453"},{"key":"1243_CR38","doi-asserted-by":"crossref","unstructured":"Saxena, A., Driemeyer, J., & Ng, A. Y. (2009). Learning 3D object orientation from images. In IEEE international conference on robotics and automation, 2009. ICRA\u201909. IEEE (pp. 794\u2013800).","DOI":"10.1109\/ROBOT.2009.5152855"},{"key":"1243_CR39","doi-asserted-by":"crossref","unstructured":"Shrivastava, A., Pfister, T., Tuzel, O., Susskind, J., Wang, W., & Webb, R. (2017). Learning from simulated and unsupervised images through adversarial training. In 2017 IEEE conference on computer vision and pattern recognition (CVPR), IEEE (pp. 2242\u20132251)","DOI":"10.1109\/CVPR.2017.241"},{"key":"1243_CR40","doi-asserted-by":"crossref","unstructured":"Su, H., Qi, C. R., Li, Y., & Guibas, L. J. (2015). Render for CNN: Viewpoint estimation in images using CNNs trained with rendered 3D model views. In Proceedings of the IEEE international conference on computer vision (pp. 2686\u20132694).","DOI":"10.1109\/ICCV.2015.308"},{"key":"1243_CR41","doi-asserted-by":"crossref","unstructured":"Sundermeyer, M., Marton, Z. C., Durner, M., Brucker, M., & Triebel, R. (2018). Implicit 3D orientation learning for 6D object detection from RGB images. In Proceedings of the European conference on computer vision (ECCV) (pp. 699\u2013715).","DOI":"10.1007\/978-3-030-01231-1_43"},{"key":"1243_CR42","unstructured":"Tekin, B., Sinha, S. N., & Fua, P. (2017). Real-time seamless single shot 6D object pose prediction. arXiv preprint \narXiv:171108848\n\n."},{"key":"1243_CR43","doi-asserted-by":"crossref","unstructured":"Tobin, J., Fong, R., Ray, A., Schneider, J., Zaremba, W., & Abbeel, P. (2017). Domain randomization for transferring deep neural networks from simulation to the real world. In 2017 IEEE\/RSJ international conference on intelligent robots and systems (IROS), IEEE (pp. 23\u201330).","DOI":"10.1109\/IROS.2017.8202133"},{"key":"1243_CR44","unstructured":"Tremblay, J., To, T., Sundaralingam, B., Xiang, Y., Fox, D., & Birchfield, S. (2018). Deep object pose estimation for semantic robotic grasping of household objects. In Conference on robot learning (pp. 306\u2013316)"},{"key":"1243_CR45","first-page":"1191","volume":"9","author":"M Ulrich","year":"2009","unstructured":"Ulrich, M., Wiedemann, C., & Steger, C. (2009). CAD-based recognition of 3D objects in monocular images. ICRA, 9, 1191\u20131198.","journal-title":"ICRA"},{"key":"1243_CR46","doi-asserted-by":"crossref","unstructured":"Vidal, J., Lin, C. Y., & Mart\u00ed, R. (2018) 6D pose estimation using an improved method based on point pair features. arXiv preprint \narXiv:180208516\n\n.","DOI":"10.1109\/ICCAR.2018.8384709"},{"issue":"Dec","key":"1243_CR47","first-page":"3371","volume":"11","author":"P Vincent","year":"2010","unstructured":"Vincent, P., Larochelle, H., Lajoie, I., Bengio, Y., & Manzagol, P. A. (2010). Stacked denoising autoencoders: Learning useful representations in a deep network with a local denoising criterion. Journal of Machine Learning Research, 11(Dec), 3371\u20133408.","journal-title":"Journal of Machine Learning Research"},{"key":"1243_CR48","doi-asserted-by":"crossref","unstructured":"Wohlhart, P., & Lepetit, V. (2015). Learning descriptors for object recognition and 3D pose estimation. In Proceedings of the IEEE conference on computer vision and pattern recognition (pp. 3109\u20133118).","DOI":"10.1109\/CVPR.2015.7298930"},{"key":"1243_CR49","unstructured":"Wu, Z., Shen, C., & Hengel, A. (2016). Bridging category-level and instance-level semantic image segmentation. arXiv preprint \narXiv:160506885\n\n."},{"key":"1243_CR50","unstructured":"Xiang, Y., Schmidt, T., Narayanan, V., & Fox, D. (2017). Posecnn: A convolutional neural network for 6D object pose estimation in cluttered scenes. arXiv preprint \narXiv:171100199\n\n."},{"key":"1243_CR51","unstructured":"Zakharov, S., Shugurov, I., & Ilic, S. (2019). DPOD: Dense 6D pose object detector in RGB images. arXiv preprint \narXiv:190211020\n\n."},{"issue":"2","key":"1243_CR52","doi-asserted-by":"publisher","first-page":"119","DOI":"10.1007\/BF01427149","volume":"13","author":"Z Zhang","year":"1994","unstructured":"Zhang, Z. (1994). Iterative point matching for registration of free-form curves and surfaces. International Journal of Computer Vision, 13(2), 119\u2013152.","journal-title":"International Journal of Computer Vision"}],"container-title":["International Journal of Computer Vision"],"original-title":[],"language":"en","link":[{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s11263-019-01243-8.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/article\/10.1007\/s11263-019-01243-8\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s11263-019-01243-8.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2020,10,21]],"date-time":"2020-10-21T23:15:23Z","timestamp":1603322123000},"score":1,"resource":{"primary":{"URL":"http:\/\/link.springer.com\/10.1007\/s11263-019-01243-8"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2019,10,23]]},"references-count":52,"journal-issue":{"issue":"3","published-print":{"date-parts":[[2020,3]]}},"alternative-id":["1243"],"URL":"https:\/\/doi.org\/10.1007\/s11263-019-01243-8","relation":{},"ISSN":["0920-5691","1573-1405"],"issn-type":[{"value":"0920-5691","type":"print"},{"value":"1573-1405","type":"electronic"}],"subject":[],"published":{"date-parts":[[2019,10,23]]},"assertion":[{"value":"4 February 2019","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"24 September 2019","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"23 October 2019","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}}]}}