{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,21]],"date-time":"2026-05-21T12:18:26Z","timestamp":1779365906748,"version":"3.53.0"},"reference-count":59,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","license":[{"start":{"date-parts":[[2025,1,1]],"date-time":"2025-01-01T00:00:00Z","timestamp":1735689600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2025,1,1]],"date-time":"2025-01-01T00:00:00Z","timestamp":1735689600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,1,1]],"date-time":"2025-01-01T00:00:00Z","timestamp":1735689600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62371310"],"award-info":[{"award-number":["62371310"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62376162"],"award-info":[{"award-number":["62376162"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100021171","name":"Basic and Applied Basic Research Foundation of Guangdong Province","doi-asserted-by":"publisher","award":["2023A1515011236"],"award-info":[{"award-number":["2023A1515011236"]}],"id":[{"id":"10.13039\/501100021171","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Stable Support Project of Shenzhen","award":["20231122122722001"],"award-info":[{"award-number":["20231122122722001"]}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Trans. Multimedia"],"published-print":{"date-parts":[[2025]]},"DOI":"10.1109\/tmm.2025.3599032","type":"journal-article","created":{"date-parts":[[2025,8,14]],"date-time":"2025-08-14T18:48:19Z","timestamp":1755197299000},"page":"7662-7674","source":"Crossref","is-referenced-by-count":2,"title":["Weakly-Supervised 3D Visual Grounding Based on Visual Language Alignment"],"prefix":"10.1109","volume":"27","author":[{"given":"Xiaoxu","family":"Xu","sequence":"first","affiliation":[{"name":"College of Computer Science and Software Engineering, Shenzhen University, Shenzhen, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8701-7689","authenticated-orcid":false,"given":"Yitian","family":"Yuan","sequence":"additional","affiliation":[{"name":"Meituan Inc., Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6067-8188","authenticated-orcid":false,"given":"Qiudan","family":"Zhang","sequence":"additional","affiliation":[{"name":"College of Computer Science and Software Engineering, Shenzhen University, Shenzhen, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0416-7719","authenticated-orcid":false,"given":"Wenhui","family":"Wu","sequence":"additional","affiliation":[{"name":"College of Electronics and Information Engineering, Shenzhen University, Shenzhen, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Zequn","family":"Jie","sequence":"additional","affiliation":[{"name":"Meituan Inc., Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7331-6132","authenticated-orcid":false,"given":"Lin","family":"Ma","sequence":"additional","affiliation":[{"name":"Meituan Inc., Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2948-6468","authenticated-orcid":false,"given":"Xu","family":"Wang","sequence":"additional","affiliation":[{"name":"College of Computer Science and Software Engineering, Shenzhen University, Shenzhen, China"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58452-8_25"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58565-5_13"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00181"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19824-3_29"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00187"},{"key":"ref6","article-title":"Ham: Hierarchical attention model with high performance for 3D visual grounding","author":"Chen","year":"2022"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1145\/3474085.3475397"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00292"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v35i2.16253"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00370"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2022.3190686"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00269"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58580-8_44"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00556"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2023.3321501"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00251"},{"key":"ref17","first-page":"8748","article-title":"Learning transferable visual models from natural language supervision","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Radford","year":"2021"},{"key":"ref18","first-page":"5583","article-title":"VILT: Vision-and-language transformer without convolution or region supervision","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Kim","year":"2021"},{"key":"ref19","first-page":"32897","article-title":"VLMO: Unified vision-language pre-training with mixture-of-modality-experts","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Bao","year":"2022"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00041"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01222"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01028"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00473"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00808"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00179"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00928"},{"key":"ref27","first-page":"570","article-title":"A framework for multiple-instance learning","volume-title":"Adv. Neural Inf. Process. Syst.","author":"Maron","year":"1997"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46448-0_49"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00425"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00270"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.emnlp-main.159"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.emnlp-main.513"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1109\/WACV56688.2023.00350"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW.2016.56"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00597"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01387"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.261"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01508"},{"key":"ref39","first-page":"37146","article-title":"Look around and refer: 2D synthetic semantics knowledge distillation for 3D visual grounding","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Bakr","year":"2022"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20059-5_24"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01596"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01843"},{"key":"ref43","first-page":"13","article-title":"VilBERT: Pretraining task-agnostic visiolinguistic representations for vision-and-language tasks","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Lu","year":"2019"},{"key":"ref44","first-page":"4904","article-title":"Scaling up visual and vision-language representation learning with noisy text supervision","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Jia","year":"2021"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2023.3237166"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00492"},{"key":"ref47","first-page":"5105","article-title":"PointNet : Deep hierarchical feature learning on point sets in a metric space","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Qi","year":"2017"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.1706.03762"},{"key":"ref49","first-page":"18661","article-title":"Supervised contrastive learning","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Khosla","year":"2020"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.1109\/iccv51070.2023.01397"},{"key":"ref51","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-73116-7_12"},{"key":"ref52","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00272"},{"key":"ref53","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01320"},{"key":"ref54","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01807"},{"key":"ref55","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00085"},{"key":"ref56","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA57147.2024.10610443"},{"key":"ref57","first-page":"1046","article-title":"LanguageRefer: Spatial-language model for 3D visual grounding","volume-title":"Proc. Conf. Robot Learn.","author":"Roh","year":"2022"},{"key":"ref58","first-page":"8026","article-title":"PyTorch: An imperative style, high-performance deep learning library","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Paszke","year":"2019"},{"key":"ref59","doi-asserted-by":"publisher","DOI":"10.5555\/3001460.3001507"}],"container-title":["IEEE Transactions on Multimedia"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/6046\/10844992\/11125487.pdf?arnumber=11125487","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,10,22]],"date-time":"2025-10-22T17:25:30Z","timestamp":1761153930000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11125487\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025]]},"references-count":59,"URL":"https:\/\/doi.org\/10.1109\/tmm.2025.3599032","relation":{},"ISSN":["1520-9210","1941-0077"],"issn-type":[{"value":"1520-9210","type":"print"},{"value":"1941-0077","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025]]}}}