{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,2,21]],"date-time":"2025-02-21T07:26:55Z","timestamp":1740122815134,"version":"3.37.3"},"reference-count":36,"publisher":"Springer Science and Business Media LLC","issue":"33-34","license":[{"start":{"date-parts":[[2020,6,22]],"date-time":"2020-06-22T00:00:00Z","timestamp":1592784000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2020,6,22]],"date-time":"2020-06-22T00:00:00Z","timestamp":1592784000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Multimed Tools Appl"],"published-print":{"date-parts":[[2020,9]]},"DOI":"10.1007\/s11042-020-09128-6","type":"journal-article","created":{"date-parts":[[2020,6,22]],"date-time":"2020-06-22T17:03:51Z","timestamp":1592845431000},"page":"24429-24448","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":4,"title":["Image captions: global-local and joint signals attention model (GL-JSAM)"],"prefix":"10.1007","volume":"79","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-5265-6026","authenticated-orcid":false,"given":"Nuzhat","family":"Naqvi","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"ZhongFu","family":"Ye","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2020,6,22]]},"reference":[{"key":"9128_CR1","doi-asserted-by":"crossref","unstructured":"Anderson P, He X, Buehler C, Teney D, Johnson M, Gould S, Zhang L (2018) Bottom-up and top-down attention for image captioning and visual question answering. 
In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 6077\u20136086","DOI":"10.1109\/CVPR.2018.00636"},{"key":"9128_CR2","unstructured":"Bahdanau D, Cho K, Bengio Y (2014) Neural machine translation by jointly learning to align and translate. arXiv preprint arXiv:1409.0473"},{"key":"9128_CR3","doi-asserted-by":"crossref","unstructured":"Chen H, Ding G, Lin Z, Zhao S, Han J (2018) Show, observe and tell: attribute-driven attention model for image captioning. In: IJCAI, pp 606-612","DOI":"10.24963\/ijcai.2018\/84"},{"key":"9128_CR4","doi-asserted-by":"crossref","unstructured":"Devlin J, Cheng H, Fang H, Gupta S, Deng L, He X, Zweig G, Mitchell M (2015). Language models for image captioning: the quirks and what works. arXiv:1505.01809","DOI":"10.3115\/v1\/P15-2017"},{"key":"9128_CR5","doi-asserted-by":"crossref","unstructured":"Donahue J, Anne Hendricks L, Guadarrama S, Rohrbach M, Venugopalan S, Saenko K, Darrell T (2015) Long-term recurrent convolutional networks for visual recognition and description. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 2625-2634","DOI":"10.1109\/CVPR.2015.7298878"},{"key":"9128_CR6","doi-asserted-by":"crossref","unstructured":"Fan DP, Wang W, Cheng MM, Shen J (2019) Shifting more attention to video salient object detection. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 8554\u20138564","DOI":"10.1109\/CVPR.2019.00875"},{"key":"9128_CR7","doi-asserted-by":"crossref","unstructured":"Farhadi A, Hejrati M, Sadeghi MA, Young P, Rashtchian C, Hockenmaier J, Forsyth D (2010) Every picture tells a story: generating sentences from images. In: European conference on computer vision, pp 15\u201329. 
Springer, Berlin, Heidelberg","DOI":"10.1007\/978-3-642-15561-1_2"},{"key":"9128_CR8","doi-asserted-by":"crossref","unstructured":"Gong Y, Wang L, Hodosh M, Hockenmaier J, Lazebnik S (2014) Improving image-sentence embeddings using large, weakly annotated photo collections. In: European conference on computer vision, pp 529\u2013545. Springer, Cham","DOI":"10.1007\/978-3-319-10593-2_35"},{"key":"9128_CR9","doi-asserted-by":"crossref","unstructured":"Gupta A, Mannem P (2012) From image annotation to image description. In: International conference on neural information processing, pp 196-204. Springer, Berlin, Heidelberg","DOI":"10.1007\/978-3-642-34500-5_24"},{"key":"9128_CR10","doi-asserted-by":"crossref","unstructured":"Karpathy A, Fei-Fei L (2015) Deep visual-semantic alignments for generating image descriptions. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 3128\u20133137","DOI":"10.1109\/CVPR.2015.7298932"},{"key":"9128_CR11","unstructured":"Kingma DP, Welling M (2013) Auto-encoding variational bayes. arXiv preprint arXiv:1312.6114"},{"key":"9128_CR12","unstructured":"Kiros R, Salakhutdinov R, Zemel R (2014a) Multimodal neural language models. In: International conference on machine learning, pp 595\u2013603"},{"issue":"12","key":"9128_CR13","doi-asserted-by":"publisher","first-page":"2891","DOI":"10.1109\/TPAMI.2012.162","volume":"35","author":"G Kulkarni","year":"2013","unstructured":"Kulkarni G, Premraj V, Ordonez V, Dhar S, Li S, Choi Y, Berg TL (2013) Babytalk: understanding and generating simple image descriptions. IEEE Trans Pattern Anal Mach Intell 35(12):2891\u20132903","journal-title":"IEEE Trans Pattern Anal Mach Intell"},{"key":"9128_CR14","unstructured":"Kuznetsova P, Ordonez V, Berg AC, Berg TL, Choi Y (2012). Collective generation of natural image descriptions. In: Proceedings of the 50th annual meeting of the Association for Computational Linguistics: Long papers-volume 1, pp 359\u2013368. 
Association for Computational Linguistics."},{"key":"9128_CR15","doi-asserted-by":"publisher","first-page":"351","DOI":"10.1162\/tacl_a_00188","volume":"2","author":"P Kuznetsova","year":"2014","unstructured":"Kuznetsova P, Ordonez V, Berg TL, Choi Y (2014) Treetalk: composition and compression of trees for image descriptions. Transactions of the Association for Computational Linguistics 2:351\u2013362","journal-title":"Transactions of the Association for Computational Linguistics"},{"key":"9128_CR16","doi-asserted-by":"crossref","unstructured":"Lavie A, & Agarwal A (2007, June). METEOR: An automatic metric for MT evaluation with high levels of correlation with human judgments. In Proceedings of the Second Workshop on Statistical Machine Translation (pp. 228\u2013231).","DOI":"10.3115\/1626355.1626389"},{"key":"9128_CR17","doi-asserted-by":"crossref","unstructured":"Li L, Tang S, Deng L, Zhang Y, Tian Q (2017) Image caption with global-local attention. In: Thirty-First AAAI Conference on Artificial Intelligence","DOI":"10.1609\/aaai.v31i1.11236"},{"key":"9128_CR18","doi-asserted-by":"crossref","unstructured":"Long C, Yang X, Xu C (2019) Cross-domain personalized image captioning. Multimedia Tools and Applications, 1\u201316.","DOI":"10.1007\/s11042-019-7441-7"},{"key":"9128_CR19","doi-asserted-by":"crossref","unstructured":"Lu J, Xiong C, Parikh D, Socher R (2017) Knowing when to look: adaptive attention via a visual sentinel for image captioning. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 375\u2013383","DOI":"10.1109\/CVPR.2017.345"},{"key":"9128_CR20","unstructured":"Mitchell M, Han X, Dodge J, Mensch A, Goyal A, Berg A, Daum\u00e9 III, H (2012) Midge: generating image descriptions from computer vision detections. In: Proceedings of the 13th conference of the European chapter of the Association for Computational Linguistics, pp 747-756. 
Association for Computational Linguistics"},{"key":"9128_CR21","doi-asserted-by":"crossref","unstructured":"Papineni K, Roukos S, Ward T, Zhu W-J (2002) BLEU: a method for automatic evaluation of machine translation. In: Proceedings of the 40th annual meeting on association for computational linguistics. Association for Computational Linguistics, 311\u2013318","DOI":"10.3115\/1073083.1073135"},{"key":"9128_CR22","unstructured":"Ren S, He K, Girshick R, Sun J (2015) Faster R-CNN: towards real-time object detection with region proposal networks. In: Advances in neural information processing systems, pp 91\u201399"},{"key":"9128_CR23","doi-asserted-by":"crossref","unstructured":"Sun C, Gan C, Nevatia R (2015) Automatic concept discovery from parallel text and visual corpora. In: Proceedings of the IEEE international conference on computer vision, pp 2596-2604","DOI":"10.1109\/ICCV.2015.298"},{"key":"9128_CR24","unstructured":"Vanderwende L, Banko M, Menezes A (2004) Event-centric summary generation. Working notes of DUC, pp 127\u2013132"},{"key":"9128_CR25","doi-asserted-by":"crossref","unstructured":"Vedantam R, Lawrence Zitnick C, & Parikh D (2015) Cider: Consensus-based image description evaluation. In Proceedings of the IEEE conference on computer vision and pattern recognition (pp. 4566\u20134575)","DOI":"10.1109\/CVPR.2015.7299087"},{"key":"9128_CR26","doi-asserted-by":"crossref","unstructured":"Vinyals O, Toshev A, Bengio S, Erhan D (2015) Show and tell: a neural image caption generator. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 3156-3164","DOI":"10.1109\/CVPR.2015.7298935"},{"key":"9128_CR27","doi-asserted-by":"crossref","unstructured":"Wang F, Jiang M, Qian C, Yang S, Li C, Zhang H, Tang X (2017) Residual attention network for image classification. 
In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp 3156\u20133164","DOI":"10.1109\/CVPR.2017.683"},{"issue":"2","key":"9128_CR28","doi-asserted-by":"publisher","first-page":"646","DOI":"10.3390\/s18020646","volume":"18","author":"L Wang","year":"2018","unstructured":"Wang L, Chu X, Zhang W, Wei Y, Sun W, Wu C (2018a) Social image captioning: exploring visual attention and user attention. Published online Sensors (Basel) 18(2):646","journal-title":"Published online Sensors (Basel)"},{"issue":"2","key":"9128_CR29","doi-asserted-by":"publisher","first-page":"1155","DOI":"10.1109\/TGRS.2018.2864987","volume":"57","author":"Q Wang","year":"2018","unstructured":"Wang Q, Liu S, Chanussot J, Li X (2018b) Scene classification with recurrent attention of VHR remote sensing images. IEEE Trans Geosci Remote Sens 57(2):1155\u20131167","journal-title":"IEEE Trans Geosci Remote Sens"},{"key":"9128_CR30","doi-asserted-by":"crossref","unstructured":"Wang T, Hu H, He C (2019) Image caption with endogenous\u2013exogenous attention. Neural Process Lett:1\u201313","DOI":"10.1007\/s11063-019-09979-7"},{"key":"9128_CR31","unstructured":"Xu K, Ba J, Kiros R, Cho K, Courville A, Salakhudinov R, Bengio Y (2015) Show, attend and tell: Neural image caption generation with visual attention. In: International conference on machine learning, pp 2048-2057"},{"key":"9128_CR32","doi-asserted-by":"crossref","unstructured":"Yao T, Pan Y, Li Y, Mei T (2018) Exploring visual relationship for image captioning. In: Proceedings of the European conference on computer vision (ECCV), pp 684-699","DOI":"10.1007\/978-3-030-01264-9_42"},{"key":"9128_CR33","doi-asserted-by":"crossref","unstructured":"You Q, Jin H, Wang Z, Fang C, Luo J (2016) Image captioning with semantic attention. 
In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 4651-4659","DOI":"10.1109\/CVPR.2016.503"},{"issue":"5","key":"9128_CR34","doi-asserted-by":"publisher","first-page":"2019","DOI":"10.1109\/TIP.2014.2311377","volume":"23","author":"J Yu","year":"2014","unstructured":"Yu J, Rui Y, Tao D (2014) Click prediction for web image re-ranking using multimodal sparse coding. IEEE Trans Image Process 23(5):2019\u20132032","journal-title":"IEEE Trans Image Process"},{"issue":"7","key":"9128_CR35","doi-asserted-by":"publisher","first-page":"3423","DOI":"10.1109\/TIP.2019.2896952","volume":"28","author":"Y Yuan","year":"2019","unstructured":"Yuan Y, Xiong Z, Wang Q (2019) VSSA-NET: vertical spatial sequence attention network for traffic sign detection. IEEE Trans Image Process 28(7):3423\u20133434","journal-title":"IEEE Trans Image Process"},{"key":"9128_CR36","doi-asserted-by":"crossref","unstructured":"Zhou Y, Sun Y, Honavar V (2019) Improving image captioning by leveraging knowledge graphs. arXiv preprint arXiv: 1901. 
08942","DOI":"10.1109\/WACV.2019.00036"}],"container-title":["Multimedia Tools and Applications"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11042-020-09128-6.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s11042-020-09128-6\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11042-020-09128-6.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,10,29]],"date-time":"2022-10-29T20:14:47Z","timestamp":1667074487000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s11042-020-09128-6"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2020,6,22]]},"references-count":36,"journal-issue":{"issue":"33-34","published-print":{"date-parts":[[2020,9]]}},"alternative-id":["9128"],"URL":"https:\/\/doi.org\/10.1007\/s11042-020-09128-6","relation":{},"ISSN":["1380-7501","1573-7721"],"issn-type":[{"type":"print","value":"1380-7501"},{"type":"electronic","value":"1573-7721"}],"subject":[],"published":{"date-parts":[[2020,6,22]]},"assertion":[{"value":"30 May 2019","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"11 May 2020","order":2,"name":"revised","label":"Revised","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"27 May 2020","order":3,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"22 June 2020","order":4,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}}]}}