{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2024,9,13]],"date-time":"2024-09-13T18:41:14Z","timestamp":1726252874227},"reference-count":40,"publisher":"Springer Science and Business Media LLC","issue":"11","license":[{"start":{"date-parts":[[2024,8,14]],"date-time":"2024-08-14T00:00:00Z","timestamp":1723593600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,8,14]],"date-time":"2024-08-14T00:00:00Z","timestamp":1723593600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["SIViP"],"published-print":{"date-parts":[[2024,11]]},"DOI":"10.1007\/s11760-024-03449-x","type":"journal-article","created":{"date-parts":[[2024,8,14]],"date-time":"2024-08-14T13:02:32Z","timestamp":1723640552000},"page":"8031-8048","update-policy":"http:\/\/dx.doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["A rich RGBD images captioning for scene understanding"],"prefix":"10.1007","volume":"18","author":[{"given":"Khadidja","family":"Delloul","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Slimane","family":"Larabi","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2024,8,14]]},"reference":[{"key":"3449_CR1","doi-asserted-by":"publisher","unstructured":"Zatout, C., Larabi, S.: A novel output device for visually impaired and blind people\u2019s aid systems. In: 1st International Conference on Communications, Control Systems and Signal Processing (CCSSP), El Oued, Algeria, pp. 119\u2013124 (2020). https:\/\/doi.org\/10.1109\/CCSSP49278.2020.9151820","DOI":"10.1109\/CCSSP49278.2020.9151820"},{"key":"3449_CR2","doi-asserted-by":"publisher","first-page":"2691","DOI":"10.1007\/s00371-021-02147-w","volume":"38","author":"C Zatout","year":"2022","unstructured":"Zatout, C., Larabi, S.: Semantic scene synthesis: application to assistive systems. Vis. Comput. 38, 2691\u20132705 (2022). https:\/\/doi.org\/10.1007\/s00371-021-02147-w","journal-title":"Vis. Comput."},{"key":"3449_CR3","unstructured":"Be My Eyes. https:\/\/www.bemyeyes.com\/"},{"key":"3449_CR4","unstructured":"Microsoft Seeing AI. https:\/\/www.microsoft.com\/en-us\/ai\/seeing-ai"},{"key":"3449_CR5","unstructured":"MindsEye Radio, Translating Vision Into Audio. https:\/\/mindseyeradio.org\/. (Accessed on 9 July) (2023)"},{"key":"3449_CR6","doi-asserted-by":"publisher","unstructured":"Benhamida, L., Delloul, K., Larabi, S.: TS-RGBD Dataset: A Novel Dataset for Theatre Scenes Description for People with Visual Impairments. https:\/\/doi.org\/10.48550\/arXiv.2308.01035. Preprint (2023)","DOI":"10.48550\/arXiv.2308.01035"},{"key":"3449_CR7","doi-asserted-by":"publisher","unstructured":"Delloul, K., Larabi, S.: Egocentric scene description for the blind and visually impaired. In: 5th International Symposium on Informatics and Its Applications (ISIA), M\u2019sila, Algeria, pp. 1\u20136 (2022). https:\/\/doi.org\/10.1109\/ISIA55826.2022.9993531","DOI":"10.1109\/ISIA55826.2022.9993531"},{"key":"3449_CR8","doi-asserted-by":"publisher","first-page":"129","DOI":"10.1016\/j.neunet.2022.01.011","volume":"148","author":"T Xian","year":"2022","unstructured":"Xian, T., Li, Z., Zhang, C., Ma, H.: Dual global enhanced transformer for image captioning. Neural Netw. 148, 129\u2013141 (2022). https:\/\/doi.org\/10.1016\/j.neunet.2022.01.011","journal-title":"Neural Netw."},{"key":"3449_CR9","doi-asserted-by":"publisher","DOI":"10.1016\/j.displa.2022.102238","volume":"73","author":"W Jiang","year":"2022","unstructured":"Jiang, W., Li, Q., Zhan, K., Fang, Y., Shen, F.: Hybrid attention network for image captioning. Displays 73, 102238 (2022). https:\/\/doi.org\/10.1016\/j.displa.2022.102238","journal-title":"Displays"},{"key":"3449_CR10","unstructured":"Wang, J., Yang, Z., Hu, X., Li, L., Lin, K., Gan, Z., Liu, Z., Liu, C., Wang, L.: Git: A generative image-to-text transformer for vision and language (2022)"},{"key":"3449_CR11","doi-asserted-by":"publisher","DOI":"10.1016\/j.eswa.2024.123847","volume":"250","author":"Y Xiaobao","year":"2024","unstructured":"Xiaobao, Y., Yang, Y., Wu, J., et al.: Ca-captioner: a novel concentrated attention for image captioning. Expert Syst. Appl. 250, 123847 (2024). https:\/\/doi.org\/10.1016\/j.eswa.2024.123847","journal-title":"Expert Syst. Appl."},{"key":"3449_CR12","doi-asserted-by":"publisher","first-page":"122955","DOI":"10.1016\/j.eswa.2023.122955","volume":"243","author":"L Chen","year":"2024","unstructured":"Chen, L., Li, K.: Dual-adaptive interactive transformer with textual and visual context for image captioning. Expert Syst. Appl. 243, 122955 (2024)","journal-title":"Expert Syst. Appl."},{"key":"3449_CR13","doi-asserted-by":"publisher","DOI":"10.1016\/j.knosys.2023.111056","volume":"282","author":"R Saeidimesineh","year":"2023","unstructured":"Saeidimesineh, R., Adibi, P., Karshenas, H., Darvishy, A.: Parallel encoder\u2013decoder framework for image captioning. Knowl. Based Syst. 282, 111056 (2023). https:\/\/doi.org\/10.1016\/j.knosys.2023.111056","journal-title":"Knowl. Based Syst."},{"key":"3449_CR14","doi-asserted-by":"publisher","DOI":"10.1016\/j.eswa.2023.120698","volume":"231","author":"J Jia","year":"2023","unstructured":"Jia, J., Ding, X., Pang, S., Gao, X., Xin, X., Hu, R., Nie, J.: Image captioning based on scene graphs: a survey. Expert Syst. Appl. 231, 120698 (2023). https:\/\/doi.org\/10.1016\/j.eswa.2023.120698","journal-title":"Expert Syst. Appl."},{"key":"3449_CR15","doi-asserted-by":"publisher","DOI":"10.1016\/j.displa.2023.102490","volume":"79","author":"R Yang","year":"2023","unstructured":"Yang, R., Cui, X., Qin, Q., Deng, Z., Lan, R., Luo, X.: Fast rf-uic: a fast unsupervised image captioning model. Displays 79, 102490 (2023)","journal-title":"Displays"},{"key":"3449_CR16","doi-asserted-by":"publisher","unstructured":"Shambharkar, P.G., Kumari, P., Yadav, P., Kumar, R.: Generating caption for image using beam search and analyzation with unsupervised image captioning algorithm. In: 5th International Conference on Intelligent Computing and Control Systems (ICICCS), Madurai, India, pp. 857\u2013864 (2021). https:\/\/doi.org\/10.1109\/ICICCS51141.2021.9432245","DOI":"10.1109\/ICICCS51141.2021.9432245"},{"key":"3449_CR17","doi-asserted-by":"publisher","DOI":"10.1016\/j.knosys.2024.111433","volume":"287","author":"C Cai","year":"2024","unstructured":"Cai, C., Wang, S., Yap, K., Wang, Y.: Top-down framework for weakly-supervised grounded image captioning. Knowl. Based Syst. 287, 111433 (2024)","journal-title":"Knowl. Based Syst."},{"key":"3449_CR18","doi-asserted-by":"publisher","first-page":"4300","DOI":"10.1007\/s10489-024-05389-y","volume":"54","author":"S Du","year":"2024","unstructured":"Du, S., Zhu, H., Lin, G., et al.: Weakly supervised grounded image captioning with semantic matching. Appl. Intell. 54, 4300\u20134318 (2024)","journal-title":"Appl. Intell."},{"key":"3449_CR19","doi-asserted-by":"crossref","unstructured":"Boroujerdi, A.S., Khanian, M., Breuss, M.: Deep interactive region segmentation and captioning (2017)","DOI":"10.1109\/SITIS.2017.27"},{"key":"3449_CR20","doi-asserted-by":"publisher","unstructured":"Patankar, R., Sethi, H., Sadhukha, A., Banjade, N., Mathur, A.: Image captioning with audio reinforcement using rnn and cnn. In: International Conference on Sustainable Computing and Smart Systems (ICSCSS), Coimbatore, India, pp. 591\u2013596 (2023). https:\/\/doi.org\/10.1109\/ICSCSS57650.2023.10169692","DOI":"10.1109\/ICSCSS57650.2023.10169692"},{"key":"3449_CR21","doi-asserted-by":"publisher","first-page":"92","DOI":"10.1016\/j.neucom.2020.02.041","volume":"396","author":"L Ruifan","year":"2020","unstructured":"Ruifan, L., Haoyu, L., Yihui, S., Fangxiang, F., Xiaojie, W.: Dual-cnn: a convolutional language decoder for paragraph image captioning. Neurocomputing 396, 92\u2013101 (2020). https:\/\/doi.org\/10.1016\/j.neucom.2020.02.041","journal-title":"Neurocomputing"},{"key":"3449_CR22","doi-asserted-by":"publisher","DOI":"10.1016\/j.knosys.2020.106730","volume":"214","author":"X Chunpu","year":"2020","unstructured":"Chunpu, X., Min, Y., Xiang, A., Ying, S., Ruifeng, X., Jinwen, T.: Retrieval-enhanced adversarial training with dynamic memory-augmented attention for image paragraph captioning. Knowl. Based Syst. 214, 106730 (2020). https:\/\/doi.org\/10.1016\/j.knosys.2020.106730","journal-title":"Knowl. Based Syst."},{"key":"3449_CR23","doi-asserted-by":"publisher","first-page":"710","DOI":"10.1109\/tpami.2019.2909864","volume":"44","author":"ZJ Zha","year":"2022","unstructured":"Zha, Z.J., Liu, D., Zhang, H., Zhang, Y., Wu, F.: Context-aware visual policy network for fine-grained image captioning. IEEE Trans. Pattern Anal. Mach. Intell. 44, 710\u2013722 (2022). https:\/\/doi.org\/10.1109\/tpami.2019.2909864","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"3449_CR24","doi-asserted-by":"publisher","unstructured":"Kanani, C.S., Saha, S., Bhattacharyya, P.: Improving diversity and reducing redundancy in paragraph captions. In: International Joint Conference on Neural Networks (IJCNN), Glasgow, UK, pp. 1\u20138 (2020). https:\/\/doi.org\/10.1109\/IJCNN48605.2020.9206644","DOI":"10.1109\/IJCNN48605.2020.9206644"},{"key":"3449_CR25","doi-asserted-by":"publisher","DOI":"10.1016\/j.knosys.2024.111401","volume":"286","author":"T Tang","year":"2024","unstructured":"Tang, T., Chen, J., Huang, Y., et al.: Image paragraph captioning with topic clustering and topic shift prediction. Knowl. Based Syst. 286, 111401 (2024). https:\/\/doi.org\/10.1016\/j.knosys.2024.111401","journal-title":"Knowl. Based Syst."},{"issue":"9","key":"3449_CR26","doi-asserted-by":"publisher","first-page":"2307","DOI":"10.1109\/TMM.2019.2954750","volume":"22","author":"W Che","year":"2020","unstructured":"Che, W., Fan, X., Xiong, R., Zhao, D.: Visual relationship embedding network for image paragraph generation. IEEE Trans. Multimed. 22(9), 2307\u20132320 (2020). https:\/\/doi.org\/10.1109\/TMM.2019.2954750","journal-title":"IEEE Trans. Multimed."},{"key":"3449_CR27","doi-asserted-by":"publisher","unstructured":"Long, Y., et al.: Capdet: unifying dense captioning and open-world detection pretraining. In: IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), Vancouver, BC, Canada, pp. 15233\u201315243 (2023). https:\/\/doi.org\/10.1109\/CVPR52729.2023.01462","DOI":"10.1109\/CVPR52729.2023.01462"},{"key":"3449_CR28","doi-asserted-by":"crossref","unstructured":"Johnson, J., Karpathy, A., Fei-Fei, L.: Densecap: Fully convolutional localization networks for dense captioning (2015)","DOI":"10.1109\/CVPR.2016.494"},{"key":"3449_CR29","doi-asserted-by":"publisher","first-page":"32","DOI":"10.1007\/s11263-016-0981-7","volume":"123","author":"R Krishna","year":"2017","unstructured":"Krishna, R., Zhu, Y., Groth, O., et al.: Visual genome: connecting language and vision using crowdsourced dense image annotations. Int. J. Comput. Vis. 123, 32\u201373 (2017). https:\/\/doi.org\/10.1007\/s11263-016-0981-7","journal-title":"Int. J. Comput. Vis."},{"key":"3449_CR30","doi-asserted-by":"crossref","unstructured":"Lin, T., Maire, M., Belongie, S., Bourdev, L., Girshick, R., et al.: Microsoft coco: common objects in context (2014)","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"3449_CR31","unstructured":"Wu, Y., Kirillov, A., Massa, F., Lo, W.Y., Girshick, R.: Detectron2. https:\/\/github.com\/facebookresearch\/detectron2 (2019)"},{"key":"3449_CR32","unstructured":"Simonyan, K., Zisserman, A.: Very deep convolutional networks for large-scale image recognition (2014)"},{"key":"3449_CR33","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., Sun, J.: Deep residual learning for image recognition (2015)","DOI":"10.1109\/CVPR.2016.90"},{"key":"3449_CR34","doi-asserted-by":"publisher","unstructured":"Lin, T., Doll\u00e1r, P., Girshick, R., He, K., Hariharan, B., Belongie, S.: Feature Pyramid Networks for Object Detection (2017). https:\/\/doi.org\/10.48550\/arXiv.1612.03144","DOI":"10.48550\/arXiv.1612.03144"},{"key":"3449_CR35","unstructured":"DenseCap in Pytorch. https:\/\/github.com\/soloist97\/densecap-pytorch. Accessed on 3 August (2023)"},{"key":"3449_CR36","unstructured":"Dataset: RGB-D Theatre Scenes Dataset. https:\/\/github.com\/khadidja-delloul\/RGB-D-Theatre-Scenes-Dataset. Accessed on 3 August (2023)"},{"key":"3449_CR37","unstructured":"LabelMe. Image Polygonal Annotation with Python. https:\/\/github.com\/wkentaro\/labelme. accessed on 3 August (2023)"},{"key":"3449_CR38","doi-asserted-by":"crossref","unstructured":"Hu, J., Huang, L., Ren, T., Zhang, S., Ji, R., Cao, L.: You only segment once: towards real-time panoptic segmentation (2023)","DOI":"10.1109\/CVPR52729.2023.01709"},{"key":"3449_CR39","doi-asserted-by":"publisher","unstructured":"Jain, J., Li, J., Chiu, M., Hassani, A., Orlov, N., Shi, H.: Oneformer: one transformer to rule universal image segmentation. In: CVPR (2023). https:\/\/doi.org\/10.48550\/arXiv.2211.06220","DOI":"10.48550\/arXiv.2211.06220"},{"key":"3449_CR40","doi-asserted-by":"publisher","unstructured":"Benhamida, L., Larabi, S.: Human action recognition and coding based on skeleton data for visually impaired and blind people aid system. In: First International Conference on Computer Communications and Intelligent Systems (I3CIS), Jijel, Algeria, pp. 49\u201354 (2022). https:\/\/doi.org\/10.1109\/I3CIS56626.2022.10075662","DOI":"10.1109\/I3CIS56626.2022.10075662"}],"container-title":["Signal, Image and Video Processing"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11760-024-03449-x.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s11760-024-03449-x\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11760-024-03449-x.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,9,13]],"date-time":"2024-09-13T17:51:05Z","timestamp":1726249865000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s11760-024-03449-x"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,8,14]]},"references-count":40,"journal-issue":{"issue":"11","published-print":{"date-parts":[[2024,11]]}},"alternative-id":["3449"],"URL":"https:\/\/doi.org\/10.1007\/s11760-024-03449-x","relation":{},"ISSN":["1863-1703","1863-1711"],"issn-type":[{"type":"print","value":"1863-1703"},{"type":"electronic","value":"1863-1711"}],"subject":[],"published":{"date-parts":[[2024,8,14]]},"assertion":[{"value":"12 May 2024","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"25 June 2024","order":2,"name":"revised","label":"Revised","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"18 July 2024","order":3,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"14 August 2024","order":4,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors report there are no conflict of interest to declare.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}}]}}