{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,10,1]],"date-time":"2025-10-01T15:36:28Z","timestamp":1759332988160,"version":"3.37.3"},"reference-count":40,"publisher":"Springer Science and Business Media LLC","issue":"2","license":[{"start":{"date-parts":[[2024,8,30]],"date-time":"2024-08-30T00:00:00Z","timestamp":1724976000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,8,30]],"date-time":"2024-08-30T00:00:00Z","timestamp":1724976000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"funder":[{"DOI":"10.13039\/501100012166","name":"National Key Research and Development Program of China","doi-asserted-by":"publisher","award":["2021YFB1714800","2021YFB1714800","2021YFB1714800","2021YFB1714800","2021YFB1714800"],"award-info":[{"award-number":["2021YFB1714800","2021YFB1714800","2021YFB1714800","2021YFB1714800","2021YFB1714800"]}],"id":[{"id":"10.13039\/501100012166","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Int. J. Mach. Learn. &amp; Cyber."],"published-print":{"date-parts":[[2025,2]]},"DOI":"10.1007\/s13042-024-02336-8","type":"journal-article","created":{"date-parts":[[2024,8,30]],"date-time":"2024-08-30T11:03:18Z","timestamp":1725015798000},"page":"1327-1340","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":2,"title":["Multi-modal 6-DoF object pose tracking: integrating spatial cues with monocular RGB imagery"],"prefix":"10.1007","volume":"16","author":[{"given":"Yunpeng","family":"Mei","sequence":"first","affiliation":[]},{"given":"Shuze","family":"Wang","sequence":"additional","affiliation":[]},{"given":"Zhuo","family":"Li","sequence":"additional","affiliation":[]},{"given":"Jian","family":"Sun","sequence":"additional","affiliation":[]},{"given":"Gang","family":"Wang","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,8,30]]},"reference":[{"issue":"4","key":"2336_CR1","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3524496","volume":"55","author":"Z Fan","year":"2022","unstructured":"Fan Z, Zhu Y, He Y, Sun Q, Liu H, He J (2022) Deep learning on monocular object pose detection and tracking: a comprehensive overview. ACM Comput Surv 55(4):1\u201340","journal-title":"ACM Comput Surv"},{"key":"2336_CR2","doi-asserted-by":"crossref","unstructured":"Rad M, Lepetit V (2017) Bb8: a scalable, accurate, robust to partial occlusion method for predicting the 3d poses of challenging objects without using depth. In: Proceedings of the IEEE international conference on computer vision, pp 3828\u20133836","DOI":"10.1109\/ICCV.2017.413"},{"key":"2336_CR3","doi-asserted-by":"crossref","unstructured":"Kehl W, Manhardt F, Tombari F, Ilic S, Navab N (2017) Ssd-6d: making rgb-based 3d detection and 6d pose estimation great again. In: Proceedings of the IEEE international conference on computer vision, pp 1521\u20131529","DOI":"10.1109\/ICCV.2017.169"},{"key":"2336_CR4","doi-asserted-by":"crossref","unstructured":"Xiang Y, Schmidt T, Narayanan V, Fox D (2017) Posecnn: A convolutional neural network for 6d object pose estimation in cluttered scenes. arXiv preprint arXiv:1711.00199","DOI":"10.15607\/RSS.2018.XIV.019"},{"key":"2336_CR5","doi-asserted-by":"crossref","unstructured":"Li B, Ouyang W, Sheng L, Zeng X, Wang X (2019) Gs3d: an efficient 3d object detection framework for autonomous driving. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 1019\u20131028","DOI":"10.1109\/CVPR.2019.00111"},{"key":"2336_CR6","unstructured":"Ren S, He K, Girshick R, Sun J (2015) Faster r-cnn: towards real-time object detection with region proposal networks. Adv Neural Inf Process Syst 28"},{"key":"2336_CR7","doi-asserted-by":"crossref","unstructured":"Liu W, Anguelov D, Erhan D, Szegedy C, Reed S, Fu C-Y, Berg AC (2016) Ssd: single shot multibox detector. In: Computer vision\u2013ECCV 2016: 14th European conference, Amsterdam, The Netherlands, October 11\u201314, 2016, Proceedings, Part I 14. Springer, pp 21\u201337","DOI":"10.1007\/978-3-319-46448-0_2"},{"key":"2336_CR8","doi-asserted-by":"crossref","unstructured":"Weng X, Wang J, Held D, Kitani K (2020) 3d multi-object tracking: a baseline and new evaluation metrics. In: 2020 IEEE\/RSJ international conference on intelligent robots and systems. IEEE, pp 10359\u201310366","DOI":"10.1109\/IROS45743.2020.9341164"},{"key":"2336_CR9","doi-asserted-by":"crossref","unstructured":"Weng X, Yuan Y, Kitani K (2020) Joint 3d tracking and forecasting with graph neural network and diversity sampling. 2(6.2):1. arXiv preprint arXiv:2003.07847","DOI":"10.1109\/LRA.2021.3068925"},{"issue":"2","key":"2336_CR10","doi-asserted-by":"publisher","first-page":"667","DOI":"10.1007\/s13042-023-01933-3","volume":"15","author":"Q Fu","year":"2024","unstructured":"Fu Q, Xie K, Wen C, He J, Zhang W, Tian H, Yang S (2024) Adaptive occlusion hybrid second-order attention network for head pose estimation. Int J Mach Learn Cybern 15(2):667\u2013683","journal-title":"Int J Mach Learn Cybern"},{"key":"2336_CR11","doi-asserted-by":"crossref","unstructured":"Trabelsi A, Chaabane M, Blanchard N, Beveridge R (2021) A pose proposal and refinement network for better 6d object pose estimation. In: Proceedings of the IEEE\/CVF winter conference on applications of computer vision, pp 2382\u20132391","DOI":"10.1109\/WACV48630.2021.00243"},{"key":"2336_CR12","doi-asserted-by":"crossref","unstructured":"Hodan T, Barath D, Matas J (2020) Epos: estimating 6d pose of objects with symmetries. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 11703\u201311712","DOI":"10.1109\/CVPR42600.2020.01172"},{"key":"2336_CR13","unstructured":"Tremblay J, To T, Sundaralingam B, Xiang Y, Fox D, Birchfield S (2018) Deep object pose estimation for semantic robotic grasping of household objects. arXiv preprint arXiv:1809.10790"},{"key":"2336_CR14","doi-asserted-by":"crossref","unstructured":"Cao Z, Simon T, Wei S.-E, Sheikh Y (2017) Realtime multi-person 2d pose estimation using part affinity fields. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 7291\u20137299","DOI":"10.1109\/CVPR.2017.143"},{"key":"2336_CR15","doi-asserted-by":"crossref","unstructured":"Lin Y, Tremblay J, Tyree S, Vela P.A, Birchfield S (2022) Keypoint-based category-level object pose tracking from an rgb sequence with uncertainty estimation. In: 2022 International conference on robotics and automation (ICRA). IEEE, pp 1258\u20131264","DOI":"10.1109\/ICRA46639.2022.9811720"},{"key":"2336_CR16","doi-asserted-by":"crossref","unstructured":"Yu F, Wang D, Shelhamer E, Darrell T (2018) Deep layer aggregation. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 2403\u20132412","DOI":"10.1109\/CVPR.2018.00255"},{"issue":"4","key":"2336_CR17","doi-asserted-by":"publisher","first-page":"923","DOI":"10.1007\/s13042-019-01056-8","volume":"11","author":"Z Wang","year":"2020","unstructured":"Wang Z, Zhou X, Wang W, Liang C (2020) Emotion recognition using multimodal deep learning in multiple psychophysiological signals and video. Int J Mach Learn Cybern 11(4):923\u2013934","journal-title":"Int J Mach Learn Cybern"},{"key":"2336_CR18","doi-asserted-by":"crossref","unstructured":"Wang S, Zhang X, Luo Z, Wang Y (2023) Multimodal sparse support tensor machine for multiple classification learning. Int J Mach Learn Cybern:1\u201313","DOI":"10.1007\/s13042-023-01972-w"},{"key":"2336_CR19","doi-asserted-by":"crossref","unstructured":"He Y, Huang H, Fan H, Chen Q, Sun J (2021) Ffb6d: a full flow bidirectional fusion network for 6d pose estimation. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 3003\u20133013","DOI":"10.1109\/CVPR46437.2021.00302"},{"key":"2336_CR20","doi-asserted-by":"crossref","unstructured":"Deilamsalehy H, Havens TC (2016) Sensor fused three-dimensional localization using imu, camera and lidar. In: 2016 IEEE sensors. IEEE, pp 1\u20133","DOI":"10.1109\/ICSENS.2016.7808523"},{"key":"2336_CR21","doi-asserted-by":"crossref","unstructured":"Tsai Y-HH, Bai S, Liang PP, Kolter JZ, Morency L-P, Salakhutdinov R (2019) Multimodal transformer for unaligned multimodal language sequences. In: Proceedings of the conference association for computational linguistics meeting, vol 2019. NIH Public Access, p 6558","DOI":"10.18653\/v1\/P19-1656"},{"key":"2336_CR22","unstructured":"Dosovitskiy A, Beyer L, Kolesnikov A, Weissenborn D, Zhai X, Unterthiner T, Dehghani M, Minderer M, Heigold G, Gelly S et al (2020) An image is worth 16 $$\\times$$ 16 words: transformers for image recognition at scale. arXiv preprint arXiv:2010.11929"},{"key":"2336_CR23","doi-asserted-by":"crossref","unstructured":"Carion N, Massa F, Synnaeve G, Usunier N, Kirillov A, Zagoruyko (2020) End-to-end object detection with transformers. In: European conference on computer vision. Springer, pp 213\u2013229","DOI":"10.1007\/978-3-030-58452-8_13"},{"key":"2336_CR24","doi-asserted-by":"crossref","unstructured":"Zheng S, Lu J, Zhao H, Zhu X, Luo Z, Wang Y, Fu Y, Feng J, Xiang T, Torr PH et al (2021) Rethinking semantic segmentation from a sequence-to-sequence perspective with transformers. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 6881\u20136890","DOI":"10.1109\/CVPR46437.2021.00681"},{"key":"2336_CR25","doi-asserted-by":"crossref","unstructured":"Zhou L, Zhou Y, Corso JJ, Socher R, Xiong C (2018) End-to-end dense video captioning with masked transformer. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 8739\u20138748","DOI":"10.1109\/CVPR.2018.00911"},{"issue":"8","key":"2336_CR26","doi-asserted-by":"publisher","first-page":"9822","DOI":"10.1109\/TPAMI.2021.3125981","volume":"45","author":"J Yin","year":"2021","unstructured":"Yin J, Shen J, Gao X, Crandall DJ, Yang R (2021) Graph neural network and spatiotemporal transformer attention for 3d video object detection from point clouds. IEEE Trans Pattern Anal Mach Intell 45(8):9822\u20139835","journal-title":"IEEE Trans Pattern Anal Mach Intell"},{"key":"2336_CR27","unstructured":"Jantos T.G, Hamdad M.A, Granig W, Weiss S, Steinbrener J (2023) Poet: pose estimation transformer for single-view, multi-object 6d pose estimation. In: Conference on robot learning. PMLR, pp 1060\u20131070"},{"key":"2336_CR28","doi-asserted-by":"publisher","first-page":"1665","DOI":"10.1109\/TMM.2023.3284598","volume":"26","author":"S Yu","year":"2024","unstructured":"Yu S, Zhai D-H, Xia Y, Li D, Zhao S (2024) Cattrack: single-stage category-level 6d object pose tracking via convolution and vision transformer. IEEE Trans Multimedia 26:1665\u20131680. https:\/\/doi.org\/10.1109\/TMM.2023.3284598","journal-title":"IEEE Trans Multimedia"},{"key":"2336_CR29","doi-asserted-by":"crossref","unstructured":"Ronneberger O, Fischer P, Brox T (2015) U-net: convolutional networks for biomedical image segmentation. In: Medical image computing and computer-assisted intervention\u2014MICCAI 2015: 18th international conference, Munich, Germany, October 5-9, Proceedings, Part III 18. Springer, pp 234\u2013241","DOI":"10.1007\/978-3-319-24574-4_28"},{"key":"2336_CR30","unstructured":"Chen J, Lu Y, Yu Q, Luo X, Adeli E, Wang Y, Lu L, Yuille AL, Zhou Y (2021) Transunet: transformers make strong encoders for medical image segmentation. arXiv preprint arXiv:2102.04306"},{"issue":"2","key":"2336_CR31","doi-asserted-by":"publisher","first-page":"103","DOI":"10.14358\/PERS.81.2.103","volume":"81","author":"YI Abdel-Aziz","year":"2015","unstructured":"Abdel-Aziz YI, Karara HM, Hauck M (2015) Direct linear transformation from comparator coordinates into object space coordinates in close-range photogrammetry. Photogramm Eng Remote Sens 81(2):103\u2013107","journal-title":"Photogramm Eng Remote Sens"},{"key":"2336_CR32","doi-asserted-by":"crossref","unstructured":"Ahmadyan A, Zhang L, Ablavatski A, Wei J, Grundmann M (2021) Objectron: a large scale dataset of object-centric videos in the wild with pose annotations. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 7822\u20137831","DOI":"10.1109\/CVPR46437.2021.00773"},{"key":"2336_CR33","unstructured":"Hou T, Ahmadyan A, Zhang L, Wei J, Grundmann M (2020) Mobilepose: real-time pose estimation for unseen objects with weak shape supervision. arXiv preprint arXiv:2003.03522"},{"key":"2336_CR34","unstructured":"Tan M, Le Q (2019) Efficientnet: rethinking model scaling for convolutional neural networks. In: International conference on machine learning. PMLR, pp 6105\u20136114"},{"key":"2336_CR35","doi-asserted-by":"crossref","unstructured":"Lin Y, Tremblay J, Tyree S, Vela PA, Birchfield S (2022) Single-stage keypoint-based category-level object pose estimation from an rgb image. In: International conference on robotics and automation (ICRA). IEEE, pp 1547\u20131553","DOI":"10.1109\/ICRA46639.2022.9812299"},{"key":"2336_CR36","doi-asserted-by":"crossref","unstructured":"Wang C, Mart\u00edn-Mart\u00edn R, Xu D, Lv J, Lu C, Fei-Fei L, Savarese S, Zhu Y (2020) 6-pack: category-level 6d pose tracker with anchor-based keypoints. In: 2020 IEEE international conference on robotics and automation (ICRA). IEEE, pp 10059\u201310066","DOI":"10.1109\/ICRA40945.2020.9196679"},{"key":"2336_CR37","doi-asserted-by":"crossref","unstructured":"Lin Y, Tremblay J, Tyree S, Vela PA, Birchfield S (2022) Single-stage keypoint-based category-level object pose estimation from an rgb image. In: 2022 International conference on robotics and automation (ICRA). IEEE, pp 1547\u20131553","DOI":"10.1109\/ICRA46639.2022.9812299"},{"key":"2336_CR38","doi-asserted-by":"crossref","unstructured":"Issac J, W\u00fcthrich M, Cifuentes CG, Bohg J, Trimpe S, Schaal S (2016) Depth-based object tracking using a robust gaussian filter. In: 2016 IEEE international conference on robotics and automation (ICRA). IEEE, pp 608\u2013615","DOI":"10.1109\/ICRA.2016.7487184"},{"key":"2336_CR39","unstructured":"Loshchilov I, Hutter F (2018) Fixing weight decay regularization in adam. arXiv preprint arXiv:1711.05101"},{"key":"2336_CR40","doi-asserted-by":"crossref","unstructured":"Zhou X, Koltun V, Kr\u00e4henb\u00fchl P (2020) Tracking objects as points. In: European conference on computer vision. Springer, pp 474\u2013490","DOI":"10.1007\/978-3-030-58548-8_28"}],"container-title":["International Journal of Machine Learning and Cybernetics"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s13042-024-02336-8.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s13042-024-02336-8\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s13042-024-02336-8.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,2,4]],"date-time":"2025-02-04T10:41:46Z","timestamp":1738665706000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s13042-024-02336-8"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,8,30]]},"references-count":40,"journal-issue":{"issue":"2","published-print":{"date-parts":[[2025,2]]}},"alternative-id":["2336"],"URL":"https:\/\/doi.org\/10.1007\/s13042-024-02336-8","relation":{},"ISSN":["1868-8071","1868-808X"],"issn-type":[{"type":"print","value":"1868-8071"},{"type":"electronic","value":"1868-808X"}],"subject":[],"published":{"date-parts":[[2024,8,30]]},"assertion":[{"value":"15 March 2024","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"16 August 2024","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"30 August 2024","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}}]}}