{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,11,15]],"date-time":"2025-11-15T10:35:34Z","timestamp":1763202934909,"version":"3.40.3"},"publisher-location":"Cham","reference-count":46,"publisher":"Springer Nature Switzerland","isbn-type":[{"type":"print","value":"9783031727535"},{"type":"electronic","value":"9783031727542"}],"license":[{"start":{"date-parts":[[2024,10,31]],"date-time":"2024-10-31T00:00:00Z","timestamp":1730332800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,10,31]],"date-time":"2024-10-31T00:00:00Z","timestamp":1730332800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-72754-2_22","type":"book-chapter","created":{"date-parts":[[2024,10,30]],"date-time":"2024-10-30T14:57:07Z","timestamp":1730300227000},"page":"387-404","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":1,"title":["Multi-task Domain Adaptation for\u00a0Language Grounding with\u00a03D Objects"],"prefix":"10.1007","author":[{"given":"Penglei","family":"Sun","sequence":"first","affiliation":[]},{"given":"Yaoxian","family":"Song","sequence":"additional","affiliation":[]},{"given":"Xinglin","family":"Pan","sequence":"additional","affiliation":[]},{"given":"Peijie","family":"Dong","sequence":"additional","affiliation":[]},{"given":"Xiaofei","family":"Yang","sequence":"additional","affiliation":[]},{"given":"Qiang","family":"Wang","sequence":"additional","affiliation":[]},{"given":"Zhixu","family":"Li","sequence":"additional","affiliation":[]},{"given":"Tiefeng","family":"Li","sequence":"additional","affiliation":[]},{"given":"Xiaowen","family":"Chu","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,10,31]]},"reference":[{"key":"22_CR1","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"422","DOI":"10.1007\/978-3-030-58452-8_25","volume-title":"Computer Vision \u2013 ECCV 2020","author":"P Achlioptas","year":"2020","unstructured":"Achlioptas, P., Abdelreheem, A., Xia, F., Elhoseiny, M., Guibas, L.: ReferIt3D: neural listeners for fine-grained 3D object identification in real-world scenes. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12346, pp. 422\u2013440. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58452-8_25"},{"doi-asserted-by":"crossref","unstructured":"Achlioptas, P., Fan, J., Hawkins, R., Goodman, N., Guibas, L.J.: ShapeGlot: learning language for shape differentiation. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 8938\u20138947 (2019)","key":"22_CR2","DOI":"10.1109\/ICCV.2019.00903"},{"unstructured":"Ahn, M., et\u00a0al.: Do as I can, not as I say: grounding language in robotic affordances. arXiv preprint arXiv:2204.01691 (2022)","key":"22_CR3"},{"doi-asserted-by":"crossref","unstructured":"Akula, A., Gella, S., Wang, K., Zhu, S.C., Reddy, S.: Mind the context: the impact of contextualization in neural module networks for grounding visual referring expressions. In: Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing, pp. 6398\u20136416 (2021)","key":"22_CR4","DOI":"10.18653\/v1\/2021.emnlp-main.516"},{"doi-asserted-by":"crossref","unstructured":"Bisk, Y., et\u00a0al.: Experience grounds language. In: Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing (EMNLP), pp. 8718\u20138735 (2020)","key":"22_CR5","DOI":"10.18653\/v1\/2020.emnlp-main.703"},{"unstructured":"Chang, A.X., et\u00a0al.: ShapeNet: an information-rich 3D model repository. arXiv preprint arXiv:1512.03012 (2015)","key":"22_CR6"},{"key":"22_CR7","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"202","DOI":"10.1007\/978-3-030-58565-5_13","volume-title":"Computer Vision \u2013 ECCV 2020","author":"DZ Chen","year":"2020","unstructured":"Chen, D.Z., Chang, A.X., Nie\u00dfner, M.: ScanRefer: 3D object localization in RGB-D scans using natural language. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12365, pp. 202\u2013221. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58565-5_13"},{"unstructured":"Chen, S., Guhur, P.L., Tapaswi, M., Schmid, C., Laptev, I.: Language conditioned spatial relation reasoning for 3D object grounding. In: Advances in Neural Information Processing Systems, vol. 35, pp. 20522\u201320535 (2022)","key":"22_CR8"},{"doi-asserted-by":"crossref","unstructured":"Corona, R., Zhu, S., Klein, D., Darrell, T.: Voxel-informed language grounding. In: Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics (Volume 2: Short Papers), pp. 54\u201360 (2022)","key":"22_CR9","DOI":"10.18653\/v1\/2022.acl-short.7"},{"unstructured":"Csurka, G.: Domain adaptation for visual applications: a comprehensive survey. arXiv preprint arXiv:1702.05374 (2017)","key":"22_CR10"},{"doi-asserted-by":"crossref","unstructured":"Deng, J., Dong, W., Socher, R., Li, L.J., Li, K., Fei-Fei, L.: ImageNet: a large-scale hierarchical image database. In: 2009 IEEE Conference on Computer Vision and Pattern Recognition, pp. 248\u2013255. IEEE (2009)","key":"22_CR11","DOI":"10.1109\/CVPR.2009.5206848"},{"doi-asserted-by":"crossref","unstructured":"Devillers, B., Choksi, B., Bielawski, R., Vanrullen, R.: Does language help generalization in vision models? In: Proceedings of the 25th Conference on Computational Natural Language Learning, pp. 171\u2013182 (2021)","key":"22_CR12","DOI":"10.18653\/v1\/2021.conll-1.13"},{"doi-asserted-by":"crossref","unstructured":"Diao, S., Xu, R., Su, H., Jiang, Y., Song, Y., Zhang, T.: Taming pre-trained language models with N-gram representations for low-resource domain adaptation. In: Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics and the 11th International Joint Conference on Natural Language Processing (Volume 1: Long Papers), pp. 3336\u20133349 (2021)","key":"22_CR13","DOI":"10.18653\/v1\/2021.acl-long.259"},{"doi-asserted-by":"publisher","unstructured":"Diao, S., Xu, T., Xu, R., Wang, J., Zhang, T.: Mixture-of-domain-adapters: decoupling and injecting domain knowledge to pre-trained language models\u2019 memories. In: Rogers, A., Boyd-Graber, J., Okazaki, N. (eds.) Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), pp. 5113\u20135129. Association for Computational Linguistics, Toronto, Canada, July 2023. https:\/\/doi.org\/10.18653\/v1\/2023.acl-long.280","key":"22_CR14","DOI":"10.18653\/v1\/2023.acl-long.280"},{"unstructured":"Dosovitskiy, A., et\u00a0al.: An image is worth 16x16 words: transformers for image recognition at scale. In: International Conference on Learning Representations (2020)","key":"22_CR15"},{"doi-asserted-by":"publisher","unstructured":"Language models and linguistic theories beyond words. Nat. Mach. Intell. 5(7), 677\u2013678 (2023). https:\/\/doi.org\/10.1038\/s42256-023-00703-8","key":"22_CR16","DOI":"10.1038\/s42256-023-00703-8"},{"issue":"1","key":"22_CR17","doi-asserted-by":"publisher","first-page":"1427","DOI":"10.1038\/s41598-023-28588-y","volume":"13","author":"Y Gong","year":"2023","unstructured":"Gong, Y., Yue, Y., Ji, W., Zhou, G.: Cross-domain few-shot learning based on pseudo-Siamese neural network. Sci. Rep. 13(1), 1427 (2023)","journal-title":"Sci. Rep."},{"doi-asserted-by":"crossref","unstructured":"Guo, Z., et al.: ViewRefer: grasp the multi-view knowledge for 3D visual grounding. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 15372\u201315383 (2023)","key":"22_CR18","DOI":"10.1109\/ICCV51070.2023.01410"},{"doi-asserted-by":"crossref","unstructured":"Gururangan, S., et al.: Don\u2019t stop pretraining: adapt language models to domains and tasks. In: Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics, pp. 8342\u20138360 (2020)","key":"22_CR19","DOI":"10.18653\/v1\/2020.acl-main.740"},{"doi-asserted-by":"crossref","unstructured":"Hao, Y., Dong, L., Wei, F., Xu, K.: Visualizing and understanding the effectiveness of BERT. In: Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing (EMNLP-IJCNLP), pp. 4143\u20134152 (2019)","key":"22_CR20","DOI":"10.18653\/v1\/D19-1424"},{"issue":"1\u20133","key":"22_CR21","doi-asserted-by":"publisher","first-page":"335","DOI":"10.1016\/0167-2789(90)90087-6","volume":"42","author":"S Harnad","year":"1990","unstructured":"Harnad, S.: The symbol grounding problem. Physica D 42(1\u20133), 335\u2013346 (1990)","journal-title":"Physica D"},{"unstructured":"Hu, E.J., et\u00a0al.: LoRA: low-rank adaptation of large language models. In: International Conference on Learning Representations (2021)","key":"22_CR22"},{"doi-asserted-by":"crossref","unstructured":"Huang, S., Chen, Y., Jia, J., Wang, L.: Multi-view transformer for 3D visual grounding. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 15524\u201315533 (2022)","key":"22_CR23","DOI":"10.1109\/CVPR52688.2022.01508"},{"unstructured":"Li, J., Li, D., Savarese, S., Hoi, S.: BLIP-2: bootstrapping language-image pre-training with frozen image encoders and large language models. In: ICML (2023)","key":"22_CR24"},{"unstructured":"Li, J., Li, D., Xiong, C., Hoi, S.: BLIP: bootstrapping language-image pre-training for unified vision-language understanding and generation. In: International Conference on Machine Learning, pp. 12888\u201312900. PMLR (2022)","key":"22_CR25"},{"unstructured":"Lu, J., Batra, D., Parikh, D., Lee, S.: ViLBERT: pretraining task-agnostic visiolinguistic representations for vision-and-language tasks. In: Advances in Neural Information Processing Systems, vol. 32 (2019)","key":"22_CR26"},{"doi-asserted-by":"crossref","unstructured":"Malik, B., Kashyap, A.R., Kan, M.Y., Poria, S.: UDApter-efficient domain adaptation using adapters. In: Proceedings of the 17th Conference of the European Chapter of the Association for Computational Linguistics, pp. 2241\u20132255 (2023)","key":"22_CR27","DOI":"10.18653\/v1\/2023.eacl-main.165"},{"unstructured":"Mitra, C., Anwar, A., Corona, R., Klein, D., Thomason, J.: Comparative multi-view language grounding. arXiv preprint arXiv:2311.06694 (2023)","key":"22_CR28"},{"doi-asserted-by":"crossref","unstructured":"Miyanishi, T., Azuma, D., Kurita, S., Kawanabe, M.: Cross3DVG: baseline and dataset for cross-dataset 3D visual grounding on different RGB-D scans. arXiv preprint arXiv:2305.13876 (2023)","key":"22_CR29","DOI":"10.1109\/3DV62453.2024.00033"},{"unstructured":"OpenAI: GPT-4 technical report (2023)","key":"22_CR30"},{"unstructured":"Radford, A., et\u00a0al.: Learning transferable visual models from natural language supervision. In: International Conference on Machine Learning, pp. 8748\u20138763. PMLR (2021)","key":"22_CR31"},{"unstructured":"Radford, A., et\u00a0al.: Language models are unsupervised multitask learners. OpenAI Blog (2019)","key":"22_CR32"},{"unstructured":"Roh, J., Desingh, K., Farhadi, A., Fox, D.: LanguageRefer: spatial-language model for 3D visual grounding. In: Conference on Robot Learning, pp. 1046\u20131056. PMLR (2022)","key":"22_CR33"},{"doi-asserted-by":"crossref","unstructured":"Schumann, R., Riezler, S.: Analyzing generalization of vision and language navigation to unseen outdoor areas. In: Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), pp. 7519\u20137532 (2022)","key":"22_CR34","DOI":"10.18653\/v1\/2022.acl-long.518"},{"doi-asserted-by":"crossref","unstructured":"Shrivastava, A., et al.: VISITRON: visual semantics-aligned interactively trained object-navigator. In: Findings of the Association for Computational Linguistics: ACL 2022, pp. 1984\u20131994 (2022)","key":"22_CR35","DOI":"10.18653\/v1\/2022.findings-acl.157"},{"unstructured":"Song, Y., Sun, P., Fang, P., Yang, L., Xiao, Y., Zhang, Y.: Human-in-the-loop robotic grasping using BERT scene representation. In: Proceedings of the 29th International Conference on Computational Linguistics, pp. 2992\u20133006 (2022)","key":"22_CR36"},{"unstructured":"Song, Y., et al.: Learning 6-DoF fine-grained grasp detection based on part affordance grounding (2024). https:\/\/arxiv.org\/abs\/2301.11564","key":"22_CR37"},{"unstructured":"Song, Y., et al.: Scene-driven multimodal knowledge graph construction for embodied AI (2023)","key":"22_CR38"},{"doi-asserted-by":"crossref","unstructured":"\u0160tef\u00e1nik, M., Novotn\u1ef3, V., Groverov\u00e1, N., Sojka, P.: AdaptOr: objective-centric adaptation framework for language models. In: Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics: System Demonstrations, pp. 261\u2013269 (2022)","key":"22_CR39","DOI":"10.18653\/v1\/2022.acl-demo.26"},{"doi-asserted-by":"crossref","unstructured":"Sun, W., Khan, H., Guenon\u00a0des Mesnards, N., Rubino, M., Arkoudas, K.: Unfreeze with care: space-efficient fine-tuning of semantic parsing models. In: Proceedings of the ACM Web Conference 2022, pp. 999\u20131007 (2022)","key":"22_CR40","DOI":"10.1145\/3485447.3511942"},{"doi-asserted-by":"crossref","unstructured":"Tai, W., Kung, H., Dong, X.L., Comiter, M., Kuo, C.F.: exBERT: extending pre-trained models with domain-specific vocabulary under constrained training resources. In: Findings of the Association for Computational Linguistics: EMNLP 2020, pp. 1433\u20131439 (2020)","key":"22_CR41","DOI":"10.18653\/v1\/2020.findings-emnlp.129"},{"unstructured":"Thomason, J., Shridhar, M., Bisk, Y., Paxton, C., Zettlemoyer, L.: Language grounding with 3D objects. In: Conference on Robot Learning, pp. 1691\u20131701. PMLR (2022)","key":"22_CR42"},{"doi-asserted-by":"crossref","unstructured":"Todorov, E., Erez, T., Tassa, Y.: MuJoCo: a physics engine for model-based control. In: 2012 IEEE\/RSJ International Conference on Intelligent Robots and Systems, pp. 5026\u20135033. IEEE (2012)","key":"22_CR43","DOI":"10.1109\/IROS.2012.6386109"},{"unstructured":"Wang, Z., Liang, J., He, R., Xu, N., Wang, Z., Tan, T.: Improving zero-shot generalization for CLIP with synthesized prompts. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 3032\u20133042 (2023)","key":"22_CR44"},{"unstructured":"Yagubbayli, F., Wang, Y., Tonioni, A., Tombari, F.: LegoFormer: transformers for block-by-block multi-view 3D reconstruction. arXiv preprint arXiv:2106.12102 (2021)","key":"22_CR45"},{"doi-asserted-by":"crossref","unstructured":"Zhang, Y., Gong, Z., Chang, A.X.: Multi3DRefer: grounding text description to multiple 3D objects. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 15225\u201315236 (2023)","key":"22_CR46","DOI":"10.1109\/ICCV51070.2023.01397"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-72754-2_22","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,10,30]],"date-time":"2024-10-30T15:13:12Z","timestamp":1730301192000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-72754-2_22"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,31]]},"ISBN":["9783031727535","9783031727542"],"references-count":46,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-72754-2_22","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2024,10,31]]},"assertion":[{"value":"31 October 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}