{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,3,26]],"date-time":"2025-03-26T05:28:29Z","timestamp":1742966909334,"version":"3.40.3"},"publisher-location":"Singapore","reference-count":35,"publisher":"Springer Nature Singapore","isbn-type":[{"type":"print","value":"9789819620708"},{"type":"electronic","value":"9789819620715"}],"license":[{"start":{"date-parts":[[2025,1,1]],"date-time":"2025-01-01T00:00:00Z","timestamp":1735689600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,1,1]],"date-time":"2025-01-01T00:00:00Z","timestamp":1735689600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-981-96-2071-5_17","type":"book-chapter","created":{"date-parts":[[2025,1,1]],"date-time":"2025-01-01T15:33:50Z","timestamp":1735745630000},"page":"226-239","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Towards Visual Storytelling by\u00a0Understanding Narrative Context Through Scene-Graphs"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-1295-777X","authenticated-orcid":false,"given":"Itthisak","family":"Phueaksri","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9193-5973","authenticated-orcid":false,"given":"Marc A.","family":"Kastner","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3799-4550","authenticated-orcid":false,"given":"Yasutomo","family":"Kawanishi","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3041-4330","authenticated-orcid":false,"given":"Takahiro","family":"Komamizu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3942-9296","authenticated-orcid":false,"given":"Ichiro","family":"Ide","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2025,1,2]]},"reference":[{"key":"17_CR1","unstructured":"Banerjee, S., Lavie, A.: METEOR: an automatic metric for MT evaluation with improved correlation with human judgments. In: Proceedings of 43rd Annual Meeting of the Association for Computational Linguistics, pp. 65\u201372 (2005)"},{"key":"17_CR2","doi-asserted-by":"crossref","unstructured":"Chen, H., Huang, Y., Takamura, H., Nakayama, H.: Commonsense knowledge aware concept selection for diverse and informative visual storytelling. In: Proceedings of 35th AAAI Conference on Artificial Intelligence, pp. 999\u20131008 (2021)","DOI":"10.1609\/aaai.v35i2.16184"},{"key":"17_CR3","unstructured":"Chen, W., Li, X., Su, J., Zhu, G., Li, Y., Ji, Y., Liu, C.: TARN-VIST: Topic aware reinforcement network for VIsual STorytelling. In: Proceedings of 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation, pp. 15617\u201315628 (2024)"},{"issue":"9","key":"17_CR4","doi-asserted-by":"publisher","first-page":"11169","DOI":"10.1109\/TPAMI.2023.3268066","volume":"45","author":"Y Cong","year":"2023","unstructured":"Cong, Y., Yang, M.Y., Rosenhahn, B.: RelTR: relation TRansformer for scene graph generation. IEEE Trans. Pattern Anal. Mach. Intell. 45(9), 11169\u201311183 (2023)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"17_CR5","doi-asserted-by":"crossref","unstructured":"Deng, J., Dong, W., Socher, R., Li, L.J., Li, K., Fei-Fei, L.: ImageNet: a large-scale hierarchical image database. In: Proceedings 2009 IEEE Computer Society Conference on Computer Vision and Pattern Recognition, pp. 248\u2013255 (2009)","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"17_CR6","doi-asserted-by":"crossref","unstructured":"Dunne, C., Shneiderman, B.: Motif Simplification: improving network visualization readability with fan, connector, and clique glyphs. In: Proceedings of 31st Annual SIGCHI Conference on Human Factors in Computing Systems, pp. 3247\u20133256 (2013)","DOI":"10.1145\/2470654.2466444"},{"key":"17_CR7","doi-asserted-by":"crossref","unstructured":"Girshick, R.: Fast R-CNN. In: Proceedings of 15th IEEE International Conference on Computer Vision, pp. 1440\u20131448 (2015)","DOI":"10.1109\/ICCV.2015.169"},{"key":"17_CR8","doi-asserted-by":"crossref","unstructured":"Graves, A., Schmidhuber, J.: Bidirectional LSTM networks for improved phoneme classification and recognition. In: Proceedings of 15th International Conference on Artificial Neural Networks, pp. 799\u2013804 (2005)","DOI":"10.1007\/11550907_126"},{"key":"17_CR9","unstructured":"Han, X., Yang, J., Hu, H., Zhang, L., Gao, J., Zhang, P.: Image Scene Graph Generation (SGG) benchmark. Comput. Res. Reposit. arXiv Preprint, arXiv:2107.12604 (Jul 2021)"},{"key":"17_CR10","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., Sun, J.: Deep residual learning for image recognition. In: Proceedings of 2016 IEEE Conference on Computer Vision and Pattern Recognition, pp. 770\u2013778 (2016)","DOI":"10.1109\/CVPR.2016.90"},{"key":"17_CR11","doi-asserted-by":"crossref","unstructured":"Hsu, C.C., et al.: Knowledge-enriched visual storytelling. In: Proceedings of 34th AAAI Conference on Artificial Intelligence, pp. 7952\u20137960 (2020)","DOI":"10.1609\/aaai.v34i05.6303"},{"key":"17_CR12","doi-asserted-by":"crossref","unstructured":"Hsu, C.Y., Chu, Y.W., Huang, T.H., Ku, L.W.: Plot and rework: modeling storylines for visual storytelling. In: Proceedings of 2021 Findings Association for Computational Linguistics, 11th International Joint Conference on Natural Language Processing, pp. 4443\u20134453 (2021)","DOI":"10.18653\/v1\/2021.findings-acl.390"},{"key":"17_CR13","unstructured":"Huang, T.K., et al.: Visual storytelling. In: Proceedings of 15th North American Chapter of the Association for Computational Linguistics: Human Language Technologies, pp. 1233\u20131239 (2016)"},{"key":"17_CR14","unstructured":"Kim, T., Heo, M., Son, S., Park, K., Zhang, B.: GLAC Net: GLocal attention cascading networks for multi-image cued story generation. In: Proceedings of 17th North American Chapter of the Association for Computational Linguistics (Workshop), pp.\u00a01\u20136 (2018)"},{"key":"17_CR15","unstructured":"Kingma, D.P., Ba, J.: Adam: a method for stochastic optimization. In: Proceedings of 3rd International Conference on Learning Representations, pp. 1\u201313 (2014)"},{"issue":"1","key":"17_CR16","doi-asserted-by":"publisher","first-page":"32","DOI":"10.1007\/s11263-016-0981-7","volume":"123","author":"R Krishna","year":"2017","unstructured":"Krishna, R., et al.: Visual Genome: connecting language and vision using crowdsourced dense image annotations. Int. J. Comput. Vis. 123(1), 32\u201373 (2017)","journal-title":"Int. J. Comput. Vis."},{"issue":"7","key":"17_CR17","doi-asserted-by":"publisher","first-page":"1956","DOI":"10.1007\/s11263-020-01316-z","volume":"128","author":"A Kuznetsova","year":"2020","unstructured":"Kuznetsova, A., et al.: The Open Images dataset V4: unified image classification, object detection, and visual relationship detection at scale. Int. J. Comput. Vis. 128(7), 1956\u20131981 (2020)","journal-title":"Int. J. Comput. Vis."},{"issue":"7","key":"17_CR18","doi-asserted-by":"publisher","first-page":"8634","DOI":"10.1109\/TPAMI.2022.3230934","volume":"45","author":"T Li","year":"2022","unstructured":"Li, T., Wang, H., He, B., Chen, C.W.: Knowledge-enriched attention network with group-wise semantic for visual storytelling. IEEE Trans. Pattern Anal. Mach. Intell. 45(7), 8634\u20138645 (2022)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"issue":"126486","key":"17_CR19","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1016\/j.neucom.2023.02.003","volume":"552","author":"H Liu","year":"2023","unstructured":"Liu, H., et al.: AOG-LSTM: an adaptive attention neural network for visual storytelling. Neurocomputing 552(126486), 1\u201313 (2023)","journal-title":"Neurocomputing"},{"issue":"2","key":"17_CR20","doi-asserted-by":"publisher","first-page":"381","DOI":"10.3758\/BRM.42.2.381","volume":"42","author":"PM McCarthy","year":"2010","unstructured":"McCarthy, P.M., Jarvis, S.: MTLD, VOCD-D, and HD-D: a validation study of sophisticated approaches to lexical diversity assessment. Behav. Res. Methods 42(2), 381\u2013392 (2010)","journal-title":"Behav. Res. Methods"},{"key":"17_CR21","doi-asserted-by":"crossref","unstructured":"Papineni, K., Roukos, S., Ward, T., Zhu, W.J.: BLEU: a method for automatic evaluation of machine translation. In: Proceeding of 40th Annual Meeting of the Association for Computational Linguistics, pp. 311\u2013318 (2002)","DOI":"10.3115\/1073083.1073135"},{"key":"17_CR22","doi-asserted-by":"crossref","unstructured":"Pennington, J., Socher, R., Manning, C.D.: GloVe: global vectors for word representation. In: Proceedings of 2014 Conference on Empirical Methods in Natural Language Processing, pp. 1532\u20131543 (2014)","DOI":"10.3115\/v1\/D14-1162"},{"key":"17_CR23","doi-asserted-by":"crossref","unstructured":"Sellam, T., Das, D., Parikh, A.P.: BLEURT: Learning robust metrics for text generation. In: Proceedings of 58th Annual Meeting of the Association for Computational Linguistics, pp. 7881\u20137892 (2020)","DOI":"10.18653\/v1\/2020.acl-main.704"},{"key":"17_CR24","doi-asserted-by":"crossref","unstructured":"Speer, R., Chin, J., Havasi, C.: ConceptNet 5.5: an open multilingual graph of general knowledge. In: Proceedings of 31st AAAI Conference on Artificial Intelligence, pp. 4444\u20134451 (2017)","DOI":"10.1609\/aaai.v31i1.11164"},{"key":"17_CR25","doi-asserted-by":"crossref","unstructured":"Tang, K., Niu, Y., Huang, J., Shi, J., Zhang, H.: Unbiased scene graph generation from biased training. In: Proceedings of 2020 IEEE Conference on Computer Vision and Pattern Recognition, pp. 3716\u20133725 (2020)","DOI":"10.1109\/CVPR42600.2020.00377"},{"key":"17_CR26","doi-asserted-by":"crossref","unstructured":"Wang, B., Ma, L., Zhang, W., Jiang, W., Zhang, F.: Hierarchical photo-scene encoder for album storytelling. In: Proceedings of 33rd AAAI Conference on Artificial Intelligence, pp. 8909\u20138916 (2019)","DOI":"10.1609\/aaai.v33i01.33018909"},{"key":"17_CR27","unstructured":"Wang, E., Han, C., Poon, J.: SCO-VIST: social interaction COmmonsense knowledge-based VIsual STorytelling. In: Proceedings of 18th Conference of the European Chapter of the Association for Computational Linguistics, pp. 1602\u20131616 (2024)"},{"key":"17_CR28","doi-asserted-by":"crossref","unstructured":"Wang, E., Han, S.C., Poon, J.: RoViST: learning robust metrics for visual STorytelling. In: Proceedings of 13th Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, pp. 2691\u20132702 (2022)","DOI":"10.18653\/v1\/2022.findings-naacl.206"},{"key":"17_CR29","doi-asserted-by":"crossref","unstructured":"Wang, R., Wei, Z., Li, P., Zhang, Q., Huang, X.: Storytelling from an image stream using scene graphs. In: Proc. 34th AAAI Conference on Artificial Intelligence & 32nd Innovative Applications of Artificial Intelligence Conference, pp. 9185\u20139192 (2020)","DOI":"10.1609\/aaai.v34i05.6455"},{"key":"17_CR30","doi-asserted-by":"crossref","unstructured":"Wang, X., Chen, W., Wang, Y., Wang, W.Y.: No metrics are perfect: adversarial reward learning for visual storytelling. In: Proceedings of 56th Annual Meeting of the Association for Computational Linguistics, pp. 899\u2013909 (2018)","DOI":"10.18653\/v1\/P18-1083"},{"key":"17_CR31","doi-asserted-by":"crossref","unstructured":"Xie, S., Girshick, R., Doll\u00e1r, P., Tu, Z., He, K.: Aggregated residual transformations for deep neural networks. In: Proceedings of 2017 IEEE Conference on Computer Vision and Pattern Recognition, pp. 1492\u20131500 (2017)","DOI":"10.1109\/CVPR.2017.634"},{"key":"17_CR32","doi-asserted-by":"crossref","unstructured":"Xu, C., Yang, M., Li, C., Shen, Y., Ao, X., Xu, R.: Imagine, reason and write: visual storytelling with graph knowledge and relational reasoning. In: Proceedings of 35th AAAI Conference on Artificial Intelligence, pp. 3022\u20133029 (2021)","DOI":"10.1609\/aaai.v35i4.16410"},{"key":"17_CR33","doi-asserted-by":"crossref","unstructured":"Yu, L., Bansal, M., Berg, T.L.: Hierarchically-attentive RNN for album summarization and storytelling. In: Proceedings of 2017 Conference on Empirical Methods in Natural Language Processing, pp. 966\u2013971 (2017)","DOI":"10.18653\/v1\/D17-1101"},{"key":"17_CR34","doi-asserted-by":"crossref","unstructured":"Zellers, R., Yatskar, M., Thomson, S., Choi, Y.: Neural Motifs: scene graph parsing with global context. In: Proceedings of 2018 IEEE Conference on Computer Vision and Pattern Recognition, pp. 5831\u20135840 (2018)","DOI":"10.1109\/CVPR.2018.00611"},{"key":"17_CR35","doi-asserted-by":"crossref","unstructured":"Zhang, M., Cui, Z., Neumann, M., Chen, Y.: An end-to-end deep learning architecture for graph classification. In: Proceedings of 32nd AAAI Conference on Artificial Intelligence, pp. 4438\u20134445 (2018)","DOI":"10.1609\/aaai.v32i1.11782"}],"container-title":["Lecture Notes in Computer Science","MultiMedia Modeling"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-981-96-2071-5_17","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,1,1]],"date-time":"2025-01-01T16:02:41Z","timestamp":1735747361000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-981-96-2071-5_17"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025]]},"ISBN":["9789819620708","9789819620715"],"references-count":35,"URL":"https:\/\/doi.org\/10.1007\/978-981-96-2071-5_17","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2025]]},"assertion":[{"value":"2 January 2025","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"MMM","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Conference on Multimedia Modeling","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Nara","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Japan","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2025","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"9 January 2025","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"11 January 2025","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"31","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"mmm2025","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/mmm2025.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}