{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,3,27]],"date-time":"2025-03-27T01:41:23Z","timestamp":1743039683981,"version":"3.40.3"},"publisher-location":"Cham","reference-count":37,"publisher":"Springer Nature Switzerland","isbn-type":[{"type":"print","value":"9783031726484"},{"type":"electronic","value":"9783031726491"}],"license":[{"start":{"date-parts":[[2024,9,30]],"date-time":"2024-09-30T00:00:00Z","timestamp":1727654400000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,9,30]],"date-time":"2024-09-30T00:00:00Z","timestamp":1727654400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-72649-1_22","type":"book-chapter","created":{"date-parts":[[2024,9,29]],"date-time":"2024-09-29T07:01:50Z","timestamp":1727593310000},"page":"385-401","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Bi-directional Contextual Attention for\u00a03D Dense Captioning"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0009-0009-8038-3710","authenticated-orcid":false,"given":"Minjung","family":"Kim","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0009-0006-0810-1027","authenticated-orcid":false,"given":"Hyung Suk","family":"Lim","sequence":"additional","affiliation":[]},{"given":"Soonyoung","family":"Lee","sequence":"additional","affiliation":[]},{"given":"Bumsoo","family":"Kim","sequence":"additional","affiliation":[]},{"given":"Gunhee","family":"Kim","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,9,30]]},"reference":[{"key":"22_CR1","doi-asserted-by":"publisher","unstructured":"Achlioptas, P., Abdelreheem, A., Xia, F., Elhoseiny, M., Guibas, L.: ReferIt3D: neural listeners for fine-grained 3D object identification in real-world scenes. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.M. (eds.) Computer Vision\u2013ECCV 2020: 16th European Conference, Glasgow, UK, 23\u201328 August 2020, Proceedings, Part I 16, pp. 422\u2013440. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58452-8_25","DOI":"10.1007\/978-3-030-58452-8_25"},{"key":"22_CR2","doi-asserted-by":"crossref","unstructured":"Anderson, P., et al.: Bottom-up and top-down attention for image captioning and visual question answering. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 6077\u20136086 (2018)","DOI":"10.1109\/CVPR.2018.00636"},{"key":"22_CR3","unstructured":"Banerjee, S., Lavie, A.: METEOR: an automatic metric for MT evaluation with improved correlation with human judgments. In: Proceedings of the ACL Workshop on Intrinsic and Extrinsic Evaluation Measures for Machine Translation and\/or Summarization, pp. 65\u201372 (2005)"},{"key":"22_CR4","doi-asserted-by":"crossref","unstructured":"Cai, D., Zhao, L., Zhang, J., Sheng, L., Xu, D.: 3DJCG: a unified framework for joint dense captioning and visual grounding on 3D point clouds. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 16464\u201316473 (2022)","DOI":"10.1109\/CVPR52688.2022.01597"},{"key":"22_CR5","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"213","DOI":"10.1007\/978-3-030-58452-8_13","volume-title":"Computer Vision \u2013 ECCV 2020","author":"N Carion","year":"2020","unstructured":"Carion, N., Massa, F., Synnaeve, G., Usunier, N., Kirillov, A., Zagoruyko, S.: End-to-end object detection with transformers. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12346, pp. 213\u2013229. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58452-8_13"},{"key":"22_CR6","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"202","DOI":"10.1007\/978-3-030-58565-5_13","volume-title":"Computer Vision \u2013 ECCV 2020","author":"DZ Chen","year":"2020","unstructured":"Chen, D.Z., Chang, A.X., Nie\u00dfner, M.: ScanRefer: 3D object localization in RGB-D scans using natural language. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12365, pp. 202\u2013221. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58565-5_13"},{"key":"22_CR7","doi-asserted-by":"publisher","unstructured":"Chen, D.Z., Wu, Q., Nie\u00dfner, M., Chang, A.X.: D3Net: a unified speaker-listener architecture for 3D dense captioning and visual grounding. In: Avidan, S., Brostow, G., Cisse, M., Farinella, G.M., Hassner, T. (eds.) Computer Vision \u2013 ECCV 2022. LNCS, vol. 13692, pp. 487\u2013505. Springer, Cham (2022). https:\/\/doi.org\/10.1007\/978-3-031-19824-3_29","DOI":"10.1007\/978-3-031-19824-3_29"},{"key":"22_CR8","doi-asserted-by":"crossref","unstructured":"Chen, J., Lei, B., Song, Q., Ying, H., Chen, D.Z., Wu, J.: A hierarchical graph network for 3D object detection on point clouds. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 392\u2013401 (2020)","DOI":"10.1109\/CVPR42600.2020.00047"},{"key":"22_CR9","doi-asserted-by":"crossref","unstructured":"Chen, S., Zhu, H., Chen, X., Lei, Y., Yu, G., Chen, T.: End-to-end 3D dense captioning with Vote2Cap-DETR. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 11124\u201311133 (2023)","DOI":"10.1109\/CVPR52729.2023.01070"},{"key":"22_CR10","doi-asserted-by":"crossref","unstructured":"Chen, S., et al.: Vote2Cap-DETR++: decoupling localization and describing for end-to-end 3D dense captioning. IEEE Trans. Pattern Anal. Mach. Intell. (2024)","DOI":"10.1109\/TPAMI.2024.3387838"},{"key":"22_CR11","doi-asserted-by":"crossref","unstructured":"Chen, Z., Gholami, A., Nie\u00dfner, M., Chang, A.X.: Scan2Cap: context-aware dense captioning in RGB-D scans. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 3193\u20133203 (2021)","DOI":"10.1109\/CVPR46437.2021.00321"},{"key":"22_CR12","doi-asserted-by":"crossref","unstructured":"Chen, Z., Hu, R., Chen, X., Nie\u00dfner, M., Chang, A.X.: Unit3D: a unified transformer for 3D dense captioning and visual grounding. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 18109\u201318119 (2023)","DOI":"10.1109\/ICCV51070.2023.01660"},{"key":"22_CR13","unstructured":"Chin-Yew, L.: ROUGE: a package for automatic evaluation of summaries. In: Proceedings of the Workshop on Text Summarization Branches Out (2004)"},{"key":"22_CR14","doi-asserted-by":"crossref","unstructured":"Cornia, M., Stefanini, M., Baraldi, L., Cucchiara, R.: Meshed-memory transformer for image captioning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 10578\u201310587 (2020)","DOI":"10.1109\/CVPR42600.2020.01059"},{"key":"22_CR15","doi-asserted-by":"crossref","unstructured":"Dai, A., Chang, A.X., Savva, M., Halber, M., Funkhouser, T., Nie\u00dfner, M.: ScanNet: richly-annotated 3D reconstructions of indoor scenes. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (2017)","DOI":"10.1109\/CVPR.2017.261"},{"key":"22_CR16","doi-asserted-by":"crossref","unstructured":"Ghandi, T., Pourreza, H., Mahyar, H.: Deep learning approaches on image captioning: a review. arXiv preprint arXiv:2201.12944 (2022)","DOI":"10.1145\/3617592"},{"issue":"6","key":"22_CR17","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3295748","volume":"51","author":"MZ Hossain","year":"2019","unstructured":"Hossain, M.Z., Sohel, F., Shiratuddin, M.F., Laga, H.: A comprehensive survey of deep learning for image captioning. ACM Comput. Surv. (CsUR) 51(6), 1\u201336 (2019)","journal-title":"ACM Comput. Surv. (CsUR)"},{"key":"22_CR18","doi-asserted-by":"crossref","unstructured":"Jiang, L., Zhao, H., Shi, S., Liu, S., Fu, C.W., Jia, J.: PointGroup: dual-set point grouping for 3D instance segmentation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 4867\u20134876 (2020)","DOI":"10.1109\/CVPR42600.2020.00492"},{"key":"22_CR19","doi-asserted-by":"publisher","unstructured":"Jiao, Y., Chen, S., Jie, Z., Chen, J., Ma, L., Jiang, Y.G.: MORE: multi-order relation mining for dense captioning in 3D scenes. In: Avidan, S., Brostow, G., Cisse, M., Farinella, G.M., Hassner, T. (eds.) European Conference on Computer Vision, pp. 528\u2013545. Springer, Cham (2022). https:\/\/doi.org\/10.1007\/978-3-031-19833-5_31","DOI":"10.1007\/978-3-031-19833-5_31"},{"key":"22_CR20","doi-asserted-by":"crossref","unstructured":"Jin, Z., Hayat, M., Yang, Y., Guo, Y., Lei, Y.: Context-aware alignment and mutual masking for 3D-language pre-training. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 10984\u201310994 (2023)","DOI":"10.1109\/CVPR52729.2023.01057"},{"issue":"1\u20132","key":"22_CR21","doi-asserted-by":"publisher","first-page":"83","DOI":"10.1002\/nav.3800020109","volume":"2","author":"HW Kuhn","year":"1955","unstructured":"Kuhn, H.W.: The Hungarian method for the assignment problem. Naval Res. Logistics Q. 2(1\u20132), 83\u201397 (1955)","journal-title":"Naval Res. Logistics Q."},{"key":"22_CR22","unstructured":"Loshchilov, I., Hutter, F.: Decoupled weight decay regularization. arXiv preprint arXiv:1711.05101 (2017)"},{"issue":"8","key":"22_CR23","doi-asserted-by":"publisher","first-page":"4867","DOI":"10.1109\/TVCG.2023.3279204","volume":"30","author":"A Mao","year":"2023","unstructured":"Mao, A., Yang, Z., Chen, W., Yi, R., Liu, Y.J.: Complete 3D relationships extraction modality alignment network for 3D dense captioning. IEEE Trans. Vis. Comput. Graph. 30(8), 4867\u20134880 (2023)","journal-title":"IEEE Trans. Vis. Comput. Graph."},{"key":"22_CR24","doi-asserted-by":"crossref","unstructured":"Misra, I., Girdhar, R., Joulin, A.: An end-to-end transformer model for 3D object detection. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 2906\u20132917 (2021)","DOI":"10.1109\/ICCV48922.2021.00290"},{"key":"22_CR25","doi-asserted-by":"crossref","unstructured":"Papineni, K., Roukos, S., Ward, T., Zhu, W.J.: BLEU: a method for automatic evaluation of machine translation. In: Proceedings of the 40th Annual Meeting of the Association for Computational Linguistics, pp. 311\u2013318 (2002)","DOI":"10.3115\/1073083.1073135"},{"key":"22_CR26","unstructured":"Paszke, A., et al.: PyTorch: an imperative style, high-performance deep learning library. In: Advances in Neural Information Processing Systems, vol. 32, pp. 8024\u20138035. Curran Associates, Inc. (2019). http:\/\/papers.neurips.cc\/paper\/9015-pytorch-an-imperative-style-high-performance-deep-learning-library.pdf"},{"key":"22_CR27","doi-asserted-by":"crossref","unstructured":"Qi, C.R., Litany, O., He, K., Guibas, L.J.: Deep Hough voting for 3D object detection in point clouds. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 9277\u20139286 (2019)","DOI":"10.1109\/ICCV.2019.00937"},{"key":"22_CR28","unstructured":"Qi, C.R., Yi, L., Su, H., Guibas, L.J.: PointNet++: deep hierarchical feature learning on point sets in a metric space. In: Advances in Neural Information Processing Systems, vol. 30 (2017)"},{"issue":"8","key":"22_CR29","first-page":"9","volume":"1","author":"A Radford","year":"2019","unstructured":"Radford, A., et al.: Language models are unsupervised multitask learners. OpenAI Blog 1(8), 9 (2019)","journal-title":"OpenAI Blog"},{"key":"22_CR30","doi-asserted-by":"crossref","unstructured":"Rennie, S.J., Marcheret, E., Mroueh, Y., Ross, J., Goel, V.: Self-critical sequence training for image captioning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 7008\u20137024 (2017)","DOI":"10.1109\/CVPR.2017.131"},{"key":"22_CR31","first-page":"7537","volume":"33","author":"M Tancik","year":"2020","unstructured":"Tancik, M., et al.: Fourier features let networks learn high frequency functions in low dimensional domains. Adv. Neural. Inf. Process. Syst. 33, 7537\u20137547 (2020)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"22_CR32","doi-asserted-by":"crossref","unstructured":"Vedantam, R., Lawrence\u00a0Zitnick, C., Parikh, D.: CIDEr: consensus-based image description evaluation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 4566\u20134575 (2015)","DOI":"10.1109\/CVPR.2015.7299087"},{"key":"22_CR33","doi-asserted-by":"crossref","unstructured":"Wang, H., Zhang, C., Yu, J., Cai, W.: Spatiality-guided transformer for 3D dense captioning on point clouds. arXiv preprint arXiv:2204.10688 (2022)","DOI":"10.24963\/ijcai.2022\/194"},{"key":"22_CR34","unstructured":"Xu, K., et al.: Show, attend and tell: neural image caption generation with visual attention. In: International Conference on Machine Learning, pp. 2048\u20132057. PMLR (2015)"},{"key":"22_CR35","doi-asserted-by":"crossref","unstructured":"Yang, X., Tang, K., Zhang, H., Cai, J.: Auto-encoding scene graphs for image captioning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 10685\u201310694 (2019)","DOI":"10.1109\/CVPR.2019.01094"},{"key":"22_CR36","doi-asserted-by":"crossref","unstructured":"Yuan, Z., et al.: X-Trans2Cap: cross-modal knowledge transfer using transformer for 3D dense captioning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 8563\u20138573 (2022)","DOI":"10.1109\/CVPR52688.2022.00837"},{"key":"22_CR37","unstructured":"Zhong, Y., Xu, L., Luo, J., Ma, L.: Contextual modeling for 3D dense captioning on point clouds. arXiv preprint arXiv:2210.03925 (2022)"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-72649-1_22","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,9,29]],"date-time":"2024-09-29T07:10:26Z","timestamp":1727593826000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-72649-1_22"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,9,30]]},"ISBN":["9783031726484","9783031726491"],"references-count":37,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-72649-1_22","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2024,9,30]]},"assertion":[{"value":"30 September 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}