{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,23]],"date-time":"2026-05-23T02:04:33Z","timestamp":1779501873580,"version":"3.53.1"},"reference-count":34,"publisher":"Elsevier BV","license":[{"start":{"date-parts":[[2026,7,1]],"date-time":"2026-07-01T00:00:00Z","timestamp":1782864000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/tdm\/userlicense\/1.0\/"},{"start":{"date-parts":[[2026,7,1]],"date-time":"2026-07-01T00:00:00Z","timestamp":1782864000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/legal\/tdmrep-license"},{"start":{"date-parts":[[2026,7,1]],"date-time":"2026-07-01T00:00:00Z","timestamp":1782864000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-017"},{"start":{"date-parts":[[2026,7,1]],"date-time":"2026-07-01T00:00:00Z","timestamp":1782864000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"},{"start":{"date-parts":[[2026,7,1]],"date-time":"2026-07-01T00:00:00Z","timestamp":1782864000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-012"},{"start":{"date-parts":[[2026,7,1]],"date-time":"2026-07-01T00:00:00Z","timestamp":1782864000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2026,7,1]],"date-time":"2026-07-01T00:00:00Z","timestamp":1782864000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-004"}],"funder":[{"DOI":"10.13039\/501100006579","name":"Ministry of Industry and Information Technology of the People's Republic of China","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100006579","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["elsevier.com","sciencedirect.com"],"crossmark-restriction":true},"short-container-title":["Pattern 
Recognition Letters"],"published-print":{"date-parts":[[2026,7]]},"DOI":"10.1016\/j.patrec.2026.03.023","type":"journal-article","created":{"date-parts":[[2026,4,29]],"date-time":"2026-04-29T16:35:09Z","timestamp":1777480509000},"page":"109-115","update-policy":"https:\/\/doi.org\/10.1016\/elsevier_cm_policy","source":"Crossref","is-referenced-by-count":0,"special_numbering":"C","title":["LM2CNet: Enhancing monocular 3D visual grounding with language guided multi-modality coupling network"],"prefix":"10.1016","volume":"205","author":[{"ORCID":"https:\/\/orcid.org\/0009-0009-3630-6208","authenticated-orcid":false,"given":"Meng","family":"Li","sequence":"first","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Qi","family":"Zhao","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Shuchang","family":"Lyu","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Jun","family":"Jiang","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Longhao","family":"Zou","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Guangliang","family":"Cheng","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"78","reference":[{"key":"10.1016\/j.patrec.2026.03.023_bib0001","doi-asserted-by":"crossref","first-page":"3955","DOI":"10.1109\/TASE.2023.3290348","article-title":"FSNet: redesign self-supervised monodepth for full-scale depth prediction for autonomous driving","volume":"21","author":"Liu","year":"2023","journal-title":"IEEE Trans. Autom. Sci. 
Eng."},{"key":"10.1016\/j.patrec.2026.03.023_bib0002","doi-asserted-by":"crossref","first-page":"1548","DOI":"10.1109\/LRA.2023.3347131","article-title":"Hi-slam: monocular real-time dense mapping with hybrid implicit fields","volume":"9","author":"Zhang","year":"2023","journal-title":"IEEE Robot. Autom. Lett."},{"key":"10.1016\/j.patrec.2026.03.023_bib0003","unstructured":"Y. Liu, Roberta: a robustly optimized bert pretraining approach, (2019) arXiv: 1907.11692."},{"key":"10.1016\/j.patrec.2026.03.023_bib0004","unstructured":"J. Achiam, S. Adler, S. Agarwal, et al., Gpt-4 technical report, (2023) arXiv: 2303.08774."},{"key":"10.1016\/j.patrec.2026.03.023_bib0005","series-title":"North American Chapter of the Association for Computational Linguistics: Human Language Technologies (NAACL-HLT)","first-page":"4171","article-title":"BERT: pre-training of deep bidirectional transformers for language understanding","author":"Devlin","year":"2019"},{"key":"10.1016\/j.patrec.2026.03.023_bib0006","series-title":"AAAI Conference on Artificial Intelligence (AAAI)","first-page":"6988","article-title":"Mono3dvg: 3d visual grounding in monocular images","author":"Zhan","year":"2024"},{"key":"10.1016\/j.patrec.2026.03.023_bib0007","series-title":"IEEE Conference on Computer Vision and Pattern Recognition (CVPR)","first-page":"1589","article-title":"Dimension embeddings for monocular 3d object detection","author":"Zhang","year":"2022"},{"key":"10.1016\/j.patrec.2026.03.023_bib0008","series-title":"AAAI Conference on Artificial Intelligence (AAAI)","first-page":"1810","article-title":"Learning auxiliary monocular contexts helps monocular 3d object detection","author":"Liu","year":"2022"},{"key":"10.1016\/j.patrec.2026.03.023_bib0009","series-title":"IEEE Conference on Computer Vision and Pattern Recognition (CVPR)","first-page":"1080","article-title":"Homography loss for monocular 3d object 
detection","author":"Gu","year":"2022"},{"key":"10.1016\/j.patrec.2026.03.023_bib0010","series-title":"IEEE Conference on Computer Vision and Pattern Recognition (CVPR)","first-page":"2791","article-title":"Diversity matters: fully exploiting depth clues for reliable monocular 3d object detection","author":"Li","year":"2022"},{"key":"10.1016\/j.patrec.2026.03.023_bib0011","series-title":"IEEE International Conference on Computer Vision (ICCV)","first-page":"3142","article-title":"Is pseudo-lidar needed for monocular 3d object detection?","author":"Park","year":"2021"},{"key":"10.1016\/j.patrec.2026.03.023_bib0012","series-title":"European Conference on Computer Vision (ECCV)","first-page":"664","article-title":"Deviant: depth equivariant network for monocular 3d object detection","author":"Kumar","year":"2022"},{"key":"10.1016\/j.patrec.2026.03.023_bib0013","series-title":"European Conference on Computer Vision (ECCV)","first-page":"71","article-title":"Did-m3d: decoupling instance depth for monocular 3d object detection","author":"Peng","year":"2022"},{"key":"10.1016\/j.patrec.2026.03.023_bib0014","series-title":"AAAI Conference on Artificial Intelligence (AAAI)","first-page":"6189","article-title":"FD3D: exploiting foreground depth map for feature-supervised monocular 3D object detection","author":"Wu","year":"2024"},{"key":"10.1016\/j.patrec.2026.03.023_bib0015","series-title":"AAAI Conference on Artificial Intelligence (AAAI)","first-page":"6467","article-title":"Geometry-guided domain generalization for monocular 3D object detection","author":"Yang","year":"2024"},{"key":"10.1016\/j.patrec.2026.03.023_bib0016","series-title":"IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","first-page":"4135","article-title":"AEAM3D: adverse environment-adaptive monocular 3D object detection via feature extraction regularization","author":"Lei","year":"2024"},{"key":"10.1016\/j.patrec.2026.03.023_bib0017","series-title":"IEEE Conference on 
Computer Vision and Pattern Recognition (CVPR)","first-page":"17482","article-title":"Weakly supervised monocular 3d object detection using multi-view projection and direction consistency","author":"Tao","year":"2023"},{"key":"10.1016\/j.patrec.2026.03.023_bib0018","series-title":"IEEE International Conference on Computer Vision (ICCV)","first-page":"9155","article-title":"MonoDETR: depth-guided transformer for monocular 3D object detection","author":"Zhang","year":"2023"},{"key":"10.1016\/j.patrec.2026.03.023_bib0019","series-title":"Neural Information Processing Systems (NeurIPS)","article-title":"Exploiting contextual objects and relations for 3d visual grounding","author":"Yang","year":"2023"},{"key":"10.1016\/j.patrec.2026.03.023_bib0020","series-title":"IEEE International Conference on Computer Vision (ICCV)","first-page":"2662","article-title":"Distilling coarse-to-fine semantic matching knowledge for weakly supervised 3d visual grounding","author":"Wang","year":"2023"},{"key":"10.1016\/j.patrec.2026.03.023_bib0021","series-title":"AAAI Conference on Artificial Intelligence (AAAI)","first-page":"3936","article-title":"Scaneru: interactive 3d visual grounding based on embodied reference understanding","author":"Lu","year":"2024"},{"key":"10.1016\/j.patrec.2026.03.023_bib0022","series-title":"IEEE Conference on Computer Vision and Pattern Recognition (CVPR)","first-page":"19231","article-title":"Eda: explicit text-decoupling and dense alignment for 3d visual grounding","author":"Wu","year":"2023"},{"key":"10.1016\/j.patrec.2026.03.023_bib0023","series-title":"IEEE International Conference on Computer Vision (ICCV)","first-page":"18109","article-title":"Unit3d: a unified transformer for 3d dense captioning and visual grounding","author":"Chen","year":"2023"},{"key":"10.1016\/j.patrec.2026.03.023_bib0024","series-title":"International Conference on Learning Representations (ICLR)","article-title":"CoT3DRef: chain-of-thoughts data-efficient 3D visual 
grounding","author":"Bakr","year":"2024"},{"key":"10.1016\/j.patrec.2026.03.023_bib0025","series-title":"IEEE Conference on Computer Vision and Pattern Recognition (CVPR)","first-page":"3354","article-title":"Are we ready for autonomous driving? The kitti vision benchmark suite","author":"Geiger","year":"2012"},{"key":"10.1016\/j.patrec.2026.03.023_bib0026","series-title":"IEEE Conference on Computer Vision and Pattern Recognition (CVPR)","first-page":"11621","article-title":"nuscenes: a multimodal dataset for autonomous driving","author":"Caesar","year":"2020"},{"key":"10.1016\/j.patrec.2026.03.023_bib0027","unstructured":"C. Sima, K. Renz, K. Chitta, et al., Drivelm: driving with graph visual question answering, (2023) arXiv: 2312.14150."},{"key":"10.1016\/j.patrec.2026.03.023_bib0028","series-title":"IEEE Conference on Computer Vision and Pattern Recognition (CVPR)","first-page":"770","article-title":"Deep residual learning for image recognition","author":"He","year":"2016"},{"key":"10.1016\/j.patrec.2026.03.023_bib0029","doi-asserted-by":"crossref","first-page":"8906","DOI":"10.1109\/TMM.2023.3243616","article-title":"Dilateformer: multi-scale dilated transformer for visual recognition","volume":"25","author":"Jiao","year":"2023","journal-title":"IEEE Trans. Multimed."},{"key":"10.1016\/j.patrec.2026.03.023_bib0030","series-title":"IEEE Conference on Computer Vision and Pattern Recognition (CVPR)","first-page":"1769","article-title":"Transvg: end-to-end visual grounding with transformers","author":"Deng","year":"2021"},{"key":"10.1016\/j.patrec.2026.03.023_bib0031","doi-asserted-by":"crossref","unstructured":"S. Liu, Z. Zeng, T. 
Ren, et al., Grounding dino: marrying dino with grounded pre-training for open-set object detection, (2023) arXiv: 2303.05499.","DOI":"10.1007\/978-3-031-72970-6_3"},{"key":"10.1016\/j.patrec.2026.03.023_bib0032","series-title":"IEEE International Conference on Computer Vision (ICCV)","first-page":"4694","article-title":"Zero-shot grounding of objects from natural language queries","author":"Sadhu","year":"2019"},{"key":"10.1016\/j.patrec.2026.03.023_bib0033","series-title":"IEEE Conference on Computer Vision and Pattern Recognition (CVPR)","first-page":"4683","article-title":"A fast and accurate one-stage approach to visual grounding","author":"Yang","year":"2019"},{"key":"10.1016\/j.patrec.2026.03.023_bib0034","series-title":"European Conference on Computer Vision (ECCV)","first-page":"387","article-title":"Improving one-stage visual grounding by recursive sub-query construction","author":"Yang","year":"2020"}],"container-title":["Pattern Recognition Letters"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0167865526001455?httpAccept=text\/xml","content-type":"text\/xml","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0167865526001455?httpAccept=text\/plain","content-type":"text\/plain","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2026,5,23]],"date-time":"2026-05-23T01:06:48Z","timestamp":1779498408000},"score":1,"resource":{"primary":{"URL":"https:\/\/linkinghub.elsevier.com\/retrieve\/pii\/S0167865526001455"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,7]]},"references-count":34,"alternative-id":["S0167865526001455"],"URL":"https:\/\/doi.org\/10.1016\/j.patrec.2026.03.023","relation":{},"ISSN":["0167-8655"],"issn-type":[{"value":"0167-8655","type":"print"}],"subject":[],"published":{"date-parts":[[2026,7]]},"assertion":[{"value":"Elsevier","name":"publisher","label":"This 
article is maintained by"},{"value":"LM2CNet: Enhancing monocular 3D visual grounding with language guided multi-modality coupling network","name":"articletitle","label":"Article Title"},{"value":"Pattern Recognition Letters","name":"journaltitle","label":"Journal Title"},{"value":"https:\/\/doi.org\/10.1016\/j.patrec.2026.03.023","name":"articlelink","label":"CrossRef DOI link to publisher maintained version"},{"value":"article","name":"content_type","label":"Content Type"},{"value":"\u00a9 2026 Elsevier B.V. All rights are reserved, including those for text and data mining, AI training, and similar technologies.","name":"copyright","label":"Copyright"}]}}