{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,11]],"date-time":"2026-03-11T01:45:20Z","timestamp":1773193520007,"version":"3.50.1"},"reference-count":54,"publisher":"Springer Science and Business Media LLC","issue":"2","license":[{"start":{"date-parts":[[2022,11,14]],"date-time":"2022-11-14T00:00:00Z","timestamp":1668384000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2022,11,14]],"date-time":"2022-11-14T00:00:00Z","timestamp":1668384000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Multimedia Systems"],"published-print":{"date-parts":[[2023,4]]},"DOI":"10.1007\/s00530-022-01012-7","type":"journal-article","created":{"date-parts":[[2022,11,14]],"date-time":"2022-11-14T06:03:56Z","timestamp":1668405836000},"page":"797-809","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":6,"title":["Guide and interact: scene-graph based generation and control of video captions"],"prefix":"10.1007","volume":"29","author":[{"given":"Xuyang","family":"Lu","sequence":"first","affiliation":[]},{"given":"Yang","family":"Gao","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2022,11,14]]},"reference":[{"key":"1012_CR1","doi-asserted-by":"crossref","unstructured":"Wu, Z., Galley, M., Brockett, C., Zhang, Y., Gao, X., Quirk, C., Koncel-Kedziorski, R., Gao, J., Hajishirzi, H., Ostendorf, M., et al.: A controllable model of grounded response generation. arXiv preprint arXiv:2005.00613 (2020)","DOI":"10.1609\/aaai.v35i16.17658"},{"key":"1012_CR2","doi-asserted-by":"crossref","unstructured":"Maynez, J., Narayan, S., Bohnet, B., McDonald, R.: On faithfulness and factuality in abstractive summarization. arXiv preprint arXiv:2005.00661 (2020)","DOI":"10.18653\/v1\/2020.acl-main.173"},{"key":"1012_CR3","doi-asserted-by":"crossref","unstructured":"Durmus, E., He, H., Diab, M.: Feqa: A question answering evaluation framework for faithfulness assessment in abstractive summarization. arXiv preprint arXiv:2005.03754 (2020)","DOI":"10.18653\/v1\/2020.acl-main.454"},{"key":"1012_CR4","unstructured":"M\u00fcller, M., Rios, A., Sennrich, R.: Domain robustness in neural machine translation. arXiv preprint arXiv:1911.03109 (2019)"},{"key":"1012_CR5","doi-asserted-by":"crossref","unstructured":"Zhou, L., Zhou, Y., Corso, J.J., Socher, R., Xiong, C.: End-to-end dense video captioning with masked transformer. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 8739\u20138748 (2018)","DOI":"10.1109\/CVPR.2018.00911"},{"key":"1012_CR6","doi-asserted-by":"crossref","unstructured":"Fang, K., Zhou, L., Jin, C., Zhang, Y., Weng, K., Zhang, T., Fan, W.: Fully convolutional video captioning with coarse-to-fine and inherited attention. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol. 33, pp. 
8271\u20138278 (2019)","DOI":"10.1609\/aaai.v33i01.33018271"},{"key":"1012_CR7","doi-asserted-by":"crossref","unstructured":"Pei, W., Zhang, J., Wang, X., Ke, L., Shen, X., Tai, Y.-W.: Memory-attended recurrent network for video captioning (2019)","DOI":"10.1109\/CVPR.2019.00854"},{"key":"1012_CR8","doi-asserted-by":"crossref","unstructured":"Lei, J., Wang, L., Shen, Y., Yu, D., Berg, T.L., Bansal, M.: Mart: Memory-augmented recurrent transformer for coherent video paragraph captioning. arXiv preprint arXiv:2005.05402 (2020)","DOI":"10.18653\/v1\/2020.acl-main.233"},{"key":"1012_CR9","doi-asserted-by":"crossref","unstructured":"Zhou, L., Kalantidis, Y., Chen, X., Corso, J.J., Rohrbach, M.: Grounded video description. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 6578\u20136587 (2019)","DOI":"10.1109\/CVPR.2019.00674"},{"key":"1012_CR10","doi-asserted-by":"crossref","unstructured":"Zhang, J., Peng, Y.: Object-aware aggregation with bidirectional temporal graph for video captioning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 8327\u20138336 (2019)","DOI":"10.1109\/CVPR.2019.00852"},{"key":"1012_CR11","doi-asserted-by":"crossref","unstructured":"Liu, F., Ren, X., Wu, X., Yang, B., Ge, S., Sun, X.: O2na: An object-oriented non-autoregressive approach for controllable video captioning. arXiv preprint arXiv:2108.02359 (2021)","DOI":"10.18653\/v1\/2021.findings-acl.24"},{"key":"1012_CR12","doi-asserted-by":"crossref","unstructured":"Zheng, Q., Wang, C., Tao, D.: Syntax-aware action targeting for video captioning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 13096\u201313105 (2020)","DOI":"10.1109\/CVPR42600.2020.01311"},{"key":"1012_CR13","doi-asserted-by":"crossref","unstructured":"Schuster, S., Krishna, R., Chang, A., Fei-Fei, L., Manning, C.D.: Generating semantically precise scene graphs from textual descriptions for improved image retrieval. In: Proceedings of the Fourth Workshop on Vision and Language, pp. 70\u201380 (2015)","DOI":"10.18653\/v1\/W15-2812"},{"key":"1012_CR14","doi-asserted-by":"crossref","unstructured":"Zhang, W., Wang, X.E., Tang, S., Shi, H., Shi, H., Xiao, J., Zhuang, Y., Wang, W.Y.: Relational graph learning for grounded video description generation. In: Proceedings of the 28th ACM International Conference on Multimedia, pp. 3807\u20133828 (2020)","DOI":"10.1145\/3394171.3413746"},{"issue":"2","key":"1012_CR15","doi-asserted-by":"publisher","first-page":"171","DOI":"10.1023\/A:1020346032608","volume":"50","author":"A Kojima","year":"2002","unstructured":"Kojima, A., Tamura, T., Fukunaga, K.: Natural language description of human activities from video images based on concept hierarchy of actions. Int. J. Comput. Vis. 50(2), 171\u2013184 (2002)","journal-title":"Int. J. Comput. Vis."},{"key":"1012_CR16","doi-asserted-by":"crossref","unstructured":"Guadarrama, S., Krishnamoorthy, N., Malkarnenkar, G., Venugopalan, S., Mooney, R., Darrell, T., Saenko, K.: Youtube2text: Recognizing and describing arbitrary activities using semantic hierarchies and zero-shot recognition. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 2712\u20132719 (2013)","DOI":"10.1109\/ICCV.2013.337"},{"key":"1012_CR17","doi-asserted-by":"crossref","unstructured":"Venugopalan, S., Xu, H., Donahue, J., Rohrbach, M., Mooney, R., Saenko, K.: Translating videos to natural language using deep recurrent neural networks. 
arXiv preprint arXiv:1412.4729 (2014)","DOI":"10.3115\/v1\/N15-1173"},{"key":"1012_CR18","doi-asserted-by":"crossref","unstructured":"Venugopalan, S., Rohrbach, M., Donahue, J., Mooney, R., Darrell, T., Saenko, K.: Sequence to sequence-video to text. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 4534\u20134542 (2015)","DOI":"10.1109\/ICCV.2015.515"},{"key":"1012_CR19","first-page":"64","volume":"5","author":"LR Medsker","year":"2001","unstructured":"Medsker, L.R., Jain, L.: Recurrent neural networks. Des. Appl. 5, 64\u201367 (2001)","journal-title":"Des. Appl."},{"key":"1012_CR20","unstructured":"Vaswani, A., Shazeer, N., Parmar, N., Uszkoreit, J., Jones, L., Gomez, A.N., Kaiser, \u0141., Polosukhin, I.: Attention is all you need. In: Advances in Neural Information Processing Systems, pp. 5998\u20136008 (2017)"},{"key":"1012_CR21","doi-asserted-by":"crossref","unstructured":"Wang, B., Ma, L., Zhang, W., Jiang, W., Wang, J., Liu, W.: Controllable video captioning with POS sequence guidance based on gated fusion network. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 2641\u20132650 (2019)","DOI":"10.1109\/ICCV.2019.00273"},{"key":"1012_CR22","doi-asserted-by":"crossref","unstructured":"Xiao, X., Wang, L., Fan, B., Xiang, S., Pan, C.: Guiding the flowing of semantics: Interpretable video captioning via POS tag. In: Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing (EMNLP-IJCNLP), pp. 2068\u20132077. Association for Computational Linguistics, Hong Kong, China (2019). 10.18653\/v1\/D19-1213. https:\/\/aclanthology.org\/D19-1213","DOI":"10.18653\/v1\/D19-1213"},{"key":"1012_CR23","unstructured":"Zhu, F., Hwang, J.-N., Ma, Z., Chen, G., Guo, J.: Ovc-net: Object-oriented video captioning with temporal graph and detail enhancement. arXiv preprint arXiv:2003.03715 (2020)"},{"key":"1012_CR24","doi-asserted-by":"crossref","unstructured":"Zhang, Z., Shi, Y., Yuan, C., Li, B., Wang, P., Hu, W., Zha, Z.-J.: Object relational graph with teacher-recommended learning for video captioning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 13278\u201313288 (2020)","DOI":"10.1109\/CVPR42600.2020.01329"},{"key":"1012_CR25","doi-asserted-by":"crossref","unstructured":"Hou, J., Wu, X., Zhang, X., Qi, Y., Jia, Y., Luo, J.: Joint commonsense and relation reasoning for image and video captioning. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol. 34, pp. 10973\u201310980 (2020)","DOI":"10.1609\/aaai.v34i07.6731"},{"key":"1012_CR26","doi-asserted-by":"crossref","unstructured":"Zellers, R., Yatskar, M., Thomson, S., Choi, Y.: Neural motifs: scene graph parsing with global context. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 5831\u20135840 (2018)","DOI":"10.1109\/CVPR.2018.00611"},{"key":"1012_CR27","unstructured":"Ghosh, S., Burachas, G., Ray, A., Ziskind, A.: Generating natural language explanations for visual question answering using scene graphs and visual attention. arXiv preprint arXiv:1902.05715 (2019)"},{"key":"1012_CR28","doi-asserted-by":"crossref","unstructured":"Schroeder, B., Tripathi, S.: Structured query-based image retrieval using scene graphs. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition Workshops, pp. 
178\u2013179 (2020)","DOI":"10.1109\/CVPRW50498.2020.00097"},{"key":"1012_CR29","doi-asserted-by":"crossref","unstructured":"Chen, S., Jin, Q., Wang, P., Wu, Q.: Say as you wish: Fine-grained control of image caption generation with abstract scene graphs. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 9962\u20139971 (2020)","DOI":"10.1109\/CVPR42600.2020.00998"},{"key":"1012_CR30","unstructured":"Milewski, V., Moens, M.-F., Calixto, I.: Are scene graphs good enough to improve image captioning? arXiv preprint arXiv:2009.12313 (2020)"},{"key":"1012_CR31","unstructured":"Xu, K., Ba, J., Kiros, R., Cho, K., Courville, A., Salakhudinov, R., Zemel, R., Bengio, Y.: Show, attend and tell: Neural image caption generation with visual attention. In: International Conference on Machine Learning, pp. 2048\u20132057 (2015). PMLR"},{"key":"1012_CR32","doi-asserted-by":"crossref","unstructured":"Li, Z., Li, Y., Lu, H.: Improve image captioning by self-attention. In: International Conference on Neural Information Processing, pp. 91\u201398 (2019). Springer","DOI":"10.1007\/978-3-030-36802-9_11"},{"key":"1012_CR33","doi-asserted-by":"crossref","unstructured":"Lu, J., Goswami, V., Rohrbach, M., Parikh, D., Lee, S.: 12-in-1: Multi-task vision and language representation learning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 10437\u201310446 (2020)","DOI":"10.1109\/CVPR42600.2020.01045"},{"key":"1012_CR34","doi-asserted-by":"crossref","unstructured":"Hu, R., Singh, A.: Unit: Multimodal multitask learning with a unified transformer. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 1439\u20131449 (2021)","DOI":"10.1109\/ICCV48922.2021.00147"},{"key":"1012_CR35","doi-asserted-by":"crossref","unstructured":"Zhu, L., Yang, Y.: Actbert: Learning global-local video-text representations. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 8746\u20138755 (2020)","DOI":"10.1109\/CVPR42600.2020.00877"},{"key":"1012_CR36","unstructured":"Luo, H., Ji, L., Shi, B., Huang, H., Duan, N., Li, T., Li, J., Bharti, T., Zhou, M.: Univl: A unified video and language pre-training model for multimodal understanding and generation. arXiv preprint arXiv:2002.06353 (2020)"},{"key":"1012_CR37","unstructured":"Akbari, H., Yuan, L., Qian, R., Chuang, W.-H., Chang, S.-F., Cui, Y., Gong, B.: Vatt: Transformers for multimodal self-supervised learning from raw video, audio and text. Advances in Neural Information Processing Systems 34 (2021)"},{"key":"1012_CR38","doi-asserted-by":"crossref","unstructured":"Song, Y., Redi, M., Vallmitjana, J., Jaimes, A.: To click or not to click: Automatic selection of beautiful thumbnails from videos. In: Proceedings of the 25th ACM International on Conference on Information and Knowledge Management, pp. 659\u2013668 (2016)","DOI":"10.1145\/2983323.2983349"},{"key":"1012_CR39","doi-asserted-by":"crossref","unstructured":"Tang, K., Niu, Y., Huang, J., Shi, J., Zhang, H.: Unbiased scene graph generation from biased training. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 
3716\u20133725 (2020)","DOI":"10.1109\/CVPR42600.2020.00377"},{"issue":"1","key":"1012_CR40","doi-asserted-by":"publisher","first-page":"32","DOI":"10.1007\/s11263-016-0981-7","volume":"123","author":"R Krishna","year":"2017","unstructured":"Krishna, R., Zhu, Y., Groth, O., Johnson, J., Hata, K., Kravitz, J., Chen, S., Kalantidis, Y., Li, L.-J., Shamma, D.A., et al.: Visual genome: connecting language and vision using crowdsourced dense image annotations. Int. J. Comput. Vis. 123(1), 32\u201373 (2017)","journal-title":"Int. J. Comput. Vis."},{"key":"1012_CR41","doi-asserted-by":"crossref","unstructured":"Ge, R., Gao, J., Chen, K., Nevatia, R.: Mac: Mining activity concepts for language-based temporal localization. In: 2019 IEEE Winter Conference on Applications of Computer Vision (WACV), pp. 245\u2013253 (2019). IEEE","DOI":"10.1109\/WACV.2019.00032"},{"key":"1012_CR42","doi-asserted-by":"crossref","unstructured":"Li, M., Chen, X., Gao, S., Chan, Z., Zhao, D., Yan, R.: Vmsmo: Learning to generate multimodal summary for video-based news articles. arXiv preprint arXiv:2010.05406 (2020)","DOI":"10.18653\/v1\/2020.emnlp-main.752"},{"key":"1012_CR43","doi-asserted-by":"crossref","unstructured":"Anderson, P., He, X., Buehler, C., Teney, D., Johnson, M., Gould, S., Zhang, L.: Bottom-up and top-down attention for image captioning and visual question answering. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 6077\u20136086 (2018)","DOI":"10.1109\/CVPR.2018.00636"},{"key":"1012_CR44","unstructured":"Bahdanau, D., Cho, K., Bengio, Y.: Neural machine translation by jointly learning to align and translate. arXiv preprint arXiv:1409.0473 (2014)"},{"key":"1012_CR45","doi-asserted-by":"crossref","unstructured":"Sigurdsson, G.A., Varol, G., Wang, X., Farhadi, A., Laptev, I., Gupta, A.: Hollywood in homes: Crowdsourcing data collection for activity understanding. In: European Conference on Computer Vision, pp. 510\u2013526 (2016). Springer","DOI":"10.1007\/978-3-319-46448-0_31"},{"key":"1012_CR46","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., Sun, J.: Deep residual learning for image recognition. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 770\u2013778 (2016)","DOI":"10.1109\/CVPR.2016.90"},{"key":"1012_CR47","doi-asserted-by":"crossref","unstructured":"Deng, J., Dong, W., Socher, R., Li, L.-J., Li, K., Fei-Fei, L.: Imagenet: A large-scale hierarchical image database. In: 2009 IEEE Conference on Computer Vision and Pattern Recognition, pp. 248\u2013255 (2009). IEEE","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"1012_CR48","doi-asserted-by":"crossref","unstructured":"Papineni, K., Roukos, S., Ward, T., Zhu, W.-J.: Bleu: a method for automatic evaluation of machine translation. In: Proceedings of the 40th Annual Meeting of the Association for Computational Linguistics, pp. 311\u2013318 (2002)","DOI":"10.3115\/1073083.1073135"},{"key":"1012_CR49","unstructured":"Lin, C.-Y.: Rouge: A package for automatic evaluation of summaries. In: Text Summarization Branches Out, pp. 74\u201381 (2004)"},{"key":"1012_CR50","doi-asserted-by":"crossref","unstructured":"Vedantam, R., Lawrence\u00a0Zitnick, C., Parikh, D.: Cider: Consensus-based image description evaluation. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 
4566\u20134575 (2015)","DOI":"10.1109\/CVPR.2015.7299087"},{"key":"1012_CR51","unstructured":"Banerjee, S., Lavie, A.: Meteor: An automatic metric for MT evaluation with improved correlation with human judgments. In: Proceedings of the ACL Workshop on Intrinsic and Extrinsic Evaluation Measures for Machine Translation and\/or Summarization, pp. 65\u201372 (2005)"},{"key":"1012_CR52","doi-asserted-by":"crossref","unstructured":"Hu, Y., Chen, Z., Zha, Z.-J., Wu, F.: Hierarchical global-local temporal modeling for video captioning. In: Proceedings of the 27th ACM International Conference on Multimedia, pp. 774\u2013783 (2019)","DOI":"10.1145\/3343031.3351072"},{"issue":"11","key":"1012_CR53","doi-asserted-by":"publisher","first-page":"5552","DOI":"10.1109\/TIP.2019.2916757","volume":"28","author":"B Zhao","year":"2019","unstructured":"Zhao, B., Li, X., Lu, X.: CAM-RNN: co-attention model based RNN for video captioning. IEEE Trans. Image Process. 28(11), 5552\u20135565 (2019)","journal-title":"IEEE Trans. Image Process."},{"key":"1012_CR54","doi-asserted-by":"crossref","unstructured":"Xiong, Y., Dai, B., Lin, D.: Move forward and tell: A progressive generator of video descriptions. In: Proceedings of the European Conference on Computer Vision (ECCV), pp. 468\u2013483 (2018)","DOI":"10.1007\/978-3-030-01252-6_29"}],"container-title":["Multimedia Systems"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s00530-022-01012-7.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s00530-022-01012-7\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s00530-022-01012-7.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2023,2,27]],"date-time":"2023-02-27T19:12:32Z","timestamp":1677525152000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s00530-022-01012-7"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2022,11,14]]},"references-count":54,"journal-issue":{"issue":"2","published-print":{"date-parts":[[2023,4]]}},"alternative-id":["1012"],"URL":"https:\/\/doi.org\/10.1007\/s00530-022-01012-7","relation":{},"ISSN":["0942-4962","1432-1882"],"issn-type":[{"value":"0942-4962","type":"print"},{"value":"1432-1882","type":"electronic"}],"subject":[],"published":{"date-parts":[[2022,11,14]]},"assertion":[{"value":"30 June 2022","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"5 October 2022","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"14 November 2022","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors declare no competing interests.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}}]}}