{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,9,11]],"date-time":"2025-09-11T22:26:30Z","timestamp":1757629590117,"version":"3.44.0"},"publisher-location":"Cham","reference-count":26,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783032045454","type":"print"},{"value":"9783032045461","type":"electronic"}],"license":[{"start":{"date-parts":[[2025,9,11]],"date-time":"2025-09-11T00:00:00Z","timestamp":1757548800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,9,11]],"date-time":"2025-09-11T00:00:00Z","timestamp":1757548800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2026]]},"DOI":"10.1007\/978-3-032-04546-1_3","type":"book-chapter","created":{"date-parts":[[2025,9,10]],"date-time":"2025-09-10T14:52:56Z","timestamp":1757515976000},"page":"26-37","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["HCNQA: Enhancing 3D VQA with\u00a0Hierarchical Concentration Narrowing Supervision"],"prefix":"10.1007","author":[{"given":"Shengli","family":"Zhou","sequence":"first","affiliation":[]},{"given":"Jianuo","family":"Zhu","sequence":"additional","affiliation":[]},{"given":"Qilin","family":"Huang","sequence":"additional","affiliation":[]},{"given":"Fangjing","family":"Wang","sequence":"additional","affiliation":[]},{"given":"Yanfu","family":"Zhang","sequence":"additional","affiliation":[]},{"given":"Feng","family":"Zheng","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2025,9,11]]},"reference":[{"key":"3_CR1","doi-asserted-by":"crossref","unstructured":"Azuma, D., Miyanishi, T., Kurita, S., Kawanabe, M.: ScanQA: 3D question answering for spatial scene understanding. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 19129\u201319139 (2022)","DOI":"10.1109\/CVPR52688.2022.01854"},{"key":"3_CR2","doi-asserted-by":"crossref","unstructured":"Cai, D., Zhao, L., Zhang, J., Sheng, L., Xu, D.: 3DJCG: a unified framework for joint dense captioning and visual grounding on 3D point clouds. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 16464\u201316473 (2022)","DOI":"10.1109\/CVPR52688.2022.01597"},{"key":"3_CR3","unstructured":"Chen, S., Tapaswi, M., Guhur, P.L., Schmid, C., Laptev, I.: Language conditioned spatial relation reasoning for 3D object grounding. In: NeurIPS (2022)"},{"key":"3_CR4","doi-asserted-by":"crossref","unstructured":"Chen, S., et al.: LL3DA: visual interactive instruction tuning for omni-3D understanding reasoning and planning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 26428\u201326438 (2024)","DOI":"10.1109\/CVPR52733.2024.02496"},{"key":"3_CR5","doi-asserted-by":"crossref","unstructured":"Dancette, C., Cad\u00e8ne, R., Teney, D., Cord, M.: Beyond question-based biases: assessing multimodal shortcut learning in visual question answering. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision (ICCV), pp. 1574\u20131583 (2021)","DOI":"10.1109\/ICCV48922.2021.00160"},{"key":"3_CR6","unstructured":"DeepSeek-AI: DeepSeek-R1: incentivizing reasoning capability in LLMs via reinforcement learning (2025). https:\/\/arxiv.org\/abs\/2501.12948"},{"key":"3_CR7","unstructured":"Delitzas, A., et al.: Multi-CLIP: contrastive vision-language pre-training for question answering tasks in 3D scenes. ArXiv abs\/2306.02329 (2023). https:\/\/api.semanticscholar.org\/CorpusID:259076122"},{"key":"3_CR8","unstructured":"Devlin, J., Chang, M.W., Lee, K., Toutanova, K.: BERT: pre-training of deep bidirectional transformers for language understanding. In: North American Chapter of the Association for Computational Linguistics (2019)"},{"key":"3_CR9","unstructured":"Hong, Y., Zhen, H., Chen, P., Zheng, S., Du, Y., Chen, Z., Gan, C.: 3D-LLM: injecting the 3D world into large language models. In: NeurIPS (2023)"},{"key":"3_CR10","unstructured":"Huang, J., et al.: An embodied generalist agent in 3D world. In: Salakhutdinov, R., et al. (eds.) Proceedings of the 41st International Conference on Machine Learning. In: Proceedings of Machine Learning Research, vol.\u00a0235, pp. 20413\u201320451. PMLR (2024). https:\/\/proceedings.mlr.press\/v235\/huang24ae.html"},{"key":"3_CR11","doi-asserted-by":"crossref","unstructured":"Huang, S., Chen, Y., Jia, J., Wang, L.: Multi-view transformer for 3D visual grounding. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 15524\u201315533 (2022)","DOI":"10.1109\/CVPR52688.2022.01508"},{"key":"3_CR12","doi-asserted-by":"crossref","unstructured":"Jin, Z., Hayat, M., Yang, Y., Guo, Y., Lei, Y.: Context-aware alignment and mutual masking for 3D-language pre-training. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 10984\u201310994 (2023)","DOI":"10.1109\/CVPR52729.2023.01057"},{"key":"3_CR13","unstructured":"Longpre, S., et al.: The flan collection: designing data and methods for effective instruction tuning (2023). https:\/\/arxiv.org\/abs\/2301.13688"},{"key":"3_CR14","unstructured":"Ma, X., Yong, S., Zheng, Z., Li, Q., Liang, Y., Zhu, S.C., Huang, S.: SQA3D: situated question answering in 3D scenes. In: International Conference on Learning Representations (2023). https:\/\/openreview.net\/forum?id=IDJx97BC38"},{"key":"3_CR15","doi-asserted-by":"publisher","unstructured":"Mo, W., Liu, Y.: Bridging the gap between 2D and 3D visual question answering: a fusion approach for 3D VQA. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol. 38, no. 5, pp. 4261\u20134268 (2024). https:\/\/doi.org\/10.1609\/aaai.v38i5.28222","DOI":"10.1609\/aaai.v38i5.28222"},{"key":"3_CR16","unstructured":"OpenAI: Learning to reason with LLMs (2024). https:\/\/openai.com\/index\/learning-to-reason-with-llms\/. Accessed 26 July 2024"},{"key":"3_CR17","doi-asserted-by":"crossref","unstructured":"Parelli, M., et al.: CLIP-guided vision-language pre-training for question answering in 3D scenes. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR) Workshops, pp. 5607\u20135612 (2023)","DOI":"10.1109\/CVPRW59228.2023.00593"},{"key":"3_CR18","unstructured":"Qi, C.R., Yi, L., Su, H., Guibas, L.J.: PointNet++: deep hierarchical feature learning on point sets in a metric space. In: Proceedings of the 31st International Conference on Neural Information Processing Systems, NIPS 2017, pp. 5105\u20135114. Curran Associates Inc., Red Hook (2017)"},{"key":"3_CR19","unstructured":"Radford, A., et al.: Learning transferable visual models from natural language supervision. In: Meila, M., Zhang, T. (eds.) Proceedings of the 38th International Conference on Machine Learning. Proceedings of Machine Learning Research, vol.\u00a0139, pp. 8748\u20138763. PMLR (2021). https:\/\/proceedings.mlr.press\/v139\/radford21a.html"},{"key":"3_CR20","doi-asserted-by":"crossref","unstructured":"Redmon, J., Divvala, S., Girshick, R., Farhadi, A.: You only look once: unified, real-time object detection. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR) (2016)","DOI":"10.1109\/CVPR.2016.91"},{"key":"3_CR21","doi-asserted-by":"crossref","unstructured":"Schult, J., Engelmann, F., Hermans, A., Litany, O., Tang, S., Leibe, B.: Mask3D: mask transformer for 3D semantic instance segmentation. In: International Conference on Robotics and Automation (ICRA) (2023)","DOI":"10.1109\/ICRA48891.2023.10160590"},{"key":"3_CR22","unstructured":"Sourulahti, S., Janssen, C.P., Jokinen, J.P.: Modeling rational adaptation of visual search to hierarchical structures (2024). https:\/\/arxiv.org\/abs\/2409.08967"},{"key":"3_CR23","unstructured":"Wang, Y., et al.: Thoughts are all over the place: on the underthinking of o1-like LLMs (2025). https:\/\/arxiv.org\/abs\/2501.18585"},{"key":"3_CR24","doi-asserted-by":"publisher","unstructured":"Ye, K., Kovashka, A.: A case study of the shortcut effects in visual commonsense reasoning. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol. 35, no. 4, pp. 3181\u20133189 (2021). https:\/\/doi.org\/10.1609\/aaai.v35i4.16428","DOI":"10.1609\/aaai.v35i4.16428"},{"key":"3_CR25","doi-asserted-by":"crossref","unstructured":"Yu, Z., Yu, J., Cui, Y., Tao, D., Tian, Q.: Deep modular co-attention networks for visual question answering. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR) (2019)","DOI":"10.1109\/CVPR.2019.00644"},{"key":"3_CR26","doi-asserted-by":"crossref","unstructured":"Zhu, Z., Ma, X., Chen, Y., Deng, Z., Huang, S., Li, Q.: 3D-VisTA: pre-trained transformer for 3D vision and text alignment. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision (ICCV), pp. 2911\u20132921 (2023)","DOI":"10.1109\/ICCV51070.2023.00272"}],"container-title":["Lecture Notes in Computer Science","Artificial Neural Networks and Machine Learning \u2013 ICANN 2025"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-032-04546-1_3","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,9,10]],"date-time":"2025-09-10T14:53:08Z","timestamp":1757515988000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-032-04546-1_3"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,9,11]]},"ISBN":["9783032045454","9783032045461"],"references-count":26,"URL":"https:\/\/doi.org\/10.1007\/978-3-032-04546-1_3","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025,9,11]]},"assertion":[{"value":"11 September 2025","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ICANN","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Conference on Artificial Neural Networks","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Kaunas","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Lithuania","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2025","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"9 September 2025","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"12 September 2025","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"34","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"icann2025","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/e-nns.org\/icann2025\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}