{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,10,1]],"date-time":"2025-10-01T16:15:09Z","timestamp":1759335309543,"version":"3.40.3"},"publisher-location":"Cham","reference-count":51,"publisher":"Springer Nature Switzerland","isbn-type":[{"type":"print","value":"9783031398209"},{"type":"electronic","value":"9783031398216"}],"license":[{"start":{"date-parts":[[2023,1,1]],"date-time":"2023-01-01T00:00:00Z","timestamp":1672531200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2023,1,1]],"date-time":"2023-01-01T00:00:00Z","timestamp":1672531200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2023]]},"DOI":"10.1007\/978-3-031-39821-6_7","type":"book-chapter","created":{"date-parts":[[2023,8,15]],"date-time":"2023-08-15T21:01:25Z","timestamp":1692133285000},"page":"98-112","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":3,"title":["Efficient Video Captioning with\u00a0Frame Similarity-Based Filtering"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-0813-3417","authenticated-orcid":false,"given":"Elyas","family":"Rashno","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3326-0875","authenticated-orcid":false,"given":"Farhana","family":"Zulkernine","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2023,8,16]]},"reference":[{"key":"7_CR1","doi-asserted-by":"crossref","unstructured":"Donahue, J., et al.: Long-term recurrent convolutional networks for visual recognition and description. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR) (2015)","DOI":"10.1109\/CVPR.2015.7298878"},{"key":"7_CR2","unstructured":"Janssens, R., Demeester, T., Belpaeme, T.: Visual conversation starters for human-robot interaction (2022)"},{"key":"7_CR3","doi-asserted-by":"crossref","unstructured":"Jiang, L., Ladner, R.: Co-designing systems to support blind and low vision audio description writers. In: Proceedings of the 24th International ACM SIGACCESS Conference on Computers and Accessibility, pp. 1\u20133 (2022)","DOI":"10.1145\/3517428.3550394"},{"key":"7_CR4","doi-asserted-by":"publisher","first-page":"92166","DOI":"10.1109\/ACCESS.2022.3202526","volume":"10","author":"RS Bhooshan","year":"2022","unstructured":"Bhooshan, R.S., Suresh, K.: A multimodal framework for video caption generation. IEEE Access 10, 92166\u201392176 (2022)","journal-title":"IEEE Access"},{"key":"7_CR5","doi-asserted-by":"crossref","unstructured":"Guadarrama, S., et al.: YouTube2Text: recognizing and describing arbitrary activities using semantic hierarchies and zero-shot recognition. In: International Conference on Computer Vision (ICCV) (2013)","DOI":"10.1109\/ICCV.2013.337"},{"key":"7_CR6","doi-asserted-by":"crossref","unstructured":"Venugopalan, S., Xu, H., Donahue, J., Rohrbach, M., Mooney, R., Saenko, K.: Translating videos to natural language using deep recurrent neural networks. In: Conference of the North American Chapter of the Association for Computational Linguistics (NAACL) (2015)","DOI":"10.3115\/v1\/N15-1173"},{"key":"7_CR7","doi-asserted-by":"crossref","unstructured":"Yao, L., et al.: Describing videos by exploiting temporal structure. arXiv preprint arXiv:1502.08029v4 (2015)","DOI":"10.1109\/ICCV.2015.512"},{"key":"7_CR8","doi-asserted-by":"crossref","unstructured":"Hu, Y., Luo, C., Chen, Z.: Make it move: controllable image-to-video generation with text descriptions. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 18219\u201318228 (2022)","DOI":"10.1109\/CVPR52688.2022.01768"},{"issue":"8","key":"7_CR9","doi-asserted-by":"publisher","first-page":"1735","DOI":"10.1162\/neco.1997.9.8.1735","volume":"9","author":"S Hochreiter","year":"1997","unstructured":"Hochreiter, S., Schmidhuber, J.: Long short-term memory. Neural Comput. 9(8), 1735\u20131780 (1997)","journal-title":"Neural Comput."},{"key":"7_CR10","unstructured":"Graves, A., Jaitly, N.: Towards end-to-end speech recognition with recurrent neural networks. In: International Conference on Machine Learning (ICML) (2014)"},{"key":"7_CR11","doi-asserted-by":"crossref","unstructured":"Mahfuz, S., Isah, H., Zulkernine, F., Nicholls, P.: Detecting irregular patterns in IoT streaming data for fall detection. In: 2018 IEEE 9th Annual Information Technology, Electronics and Mobile Communication Conference (IEMCON), pp. 588\u2013594. IEEE (2018)","DOI":"10.1109\/IEMCON.2018.8614822"},{"key":"7_CR12","unstructured":"Sutskever, I., Vinyals, O., Le, Q.V.: Sequence to sequence learning with neural networks. In: Conference on Neural Information Processing Systems (NIPS) (2014)"},{"key":"7_CR13","doi-asserted-by":"crossref","unstructured":"Venugopalan, S., Rohrbach, M., Donahue, J., Mooney, R.J., Darrell, T., Saenko, K.: Sequence to sequence-video to text. In: ICCV, pp. 4534\u20134542 (2015)","DOI":"10.1109\/ICCV.2015.515"},{"key":"7_CR14","unstructured":"Chen, D.L., Dolan, W.B.: Collecting highly parallel data for paraphrase evaluation. In: ACL (2011)"},{"key":"7_CR15","unstructured":"Torabi, A., Pal, C., Larochelle, H., Courville, A.: Using descriptive video services to create a large data source for video annotation research. arXiv preprint arXiv:1503.01070 (2015)"},{"key":"7_CR16","doi-asserted-by":"crossref","unstructured":"Rohrbach, A., Rohrbach, M., Tandon, N., Schiele, B.: A dataset for movie description. In: CVPR (2015)","DOI":"10.1109\/CVPR.2015.7298940"},{"key":"7_CR17","doi-asserted-by":"crossref","unstructured":"Wang, H., Zhang, Y., Yu, X., et al.: An overview of image caption generation methods. Comput. Intell. Neurosci. 2020 (2020)","DOI":"10.1155\/2020\/3062706"},{"key":"7_CR18","doi-asserted-by":"crossref","unstructured":"Lee, M.W., Hakeem, A., Haering, N., Zhu, S.: SAVE: a framework for semantic annotation of visual events. In: CVPR, pp. 1\u20138 (2008)","DOI":"10.1109\/CVPRW.2008.4562954"},{"key":"7_CR19","doi-asserted-by":"publisher","first-page":"171","DOI":"10.1023\/A:1020346032608","volume":"50","author":"A Kojima","year":"2002","unstructured":"Kojima, A., Tamura, T., Fukunaga, K.: Natural language description of human activities from video images based on concept hierarchy of actions. Int. J. Comput. Vision 50, 171\u2013184 (2002)","journal-title":"Int. J. Comput. Vision"},{"key":"7_CR20","doi-asserted-by":"crossref","unstructured":"Khan, M.U.G., Zhang, L., Gotoh, Y.: Human focused video description. In: ICCV, pp. 1480\u20131487 (2011)","DOI":"10.1109\/ICCVW.2011.6130425"},{"key":"7_CR21","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"372","DOI":"10.1007\/978-3-642-33863-2_37","volume-title":"Computer Vision \u2013 ECCV 2012. Workshops and Demonstrations","author":"P Hanckmann","year":"2012","unstructured":"Hanckmann, P., Schutte, K., Burghouts, G.J.: Automated textual descriptions for a wide range of video events with 48 human actions. In: Fusiello, A., Murino, V., Cucchiara, R. (eds.) ECCV 2012. LNCS, vol. 7583, pp. 372\u2013380. Springer, Heidelberg (2012). https:\/\/doi.org\/10.1007\/978-3-642-33863-2_37"},{"key":"7_CR22","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"15","DOI":"10.1007\/978-3-642-15561-1_2","volume-title":"Computer Vision \u2013 ECCV 2010","author":"A Farhadi","year":"2010","unstructured":"Farhadi, A., Hejrati, M., Sadeghi, M.A., Young, P., Rashtchian, C., Hockenmaier, J., Forsyth, D.: Every picture tells a story: generating sentences from images. In: Daniilidis, K., Maragos, P., Paragios, N. (eds.) ECCV 2010. LNCS, vol. 6314, pp. 15\u201329. Springer, Heidelberg (2010). https:\/\/doi.org\/10.1007\/978-3-642-15561-1_2"},{"key":"7_CR23","doi-asserted-by":"crossref","unstructured":"Rohrbach, M., Qiu, W., Titov, I., Thater, S., Pinkal, M., Schiele, B: Translating video content to natural language descriptions. In: ICCV, pp. 433\u2013440 (2013)","DOI":"10.1109\/ICCV.2013.61"},{"key":"7_CR24","doi-asserted-by":"crossref","unstructured":"Guo, Z., Gao, L., Song, J., Xu, X., Shao, J., Shen, H.T.: Attention-based LSTM with semantic consistency for videos captioning. In: ACM MM, pp. 357\u2013361 (2016)","DOI":"10.1145\/2964284.2967242"},{"key":"7_CR25","doi-asserted-by":"crossref","unstructured":"Pan, Y., Mei, T., Yao, T., Li, H., Rui, Y.: Jointly modeling embedding and translation to bridge video and language. In: CVPR, pp. 4594\u20134602 (2016)","DOI":"10.1109\/CVPR.2016.497"},{"key":"7_CR26","doi-asserted-by":"crossref","unstructured":"Yao, L., et al.: Describing videos by exploiting temporal structure. In: ICCV, pp. 4507\u20134515 (2015)","DOI":"10.1109\/ICCV.2015.512"},{"key":"7_CR27","doi-asserted-by":"crossref","unstructured":"Pan, Y., Yao, T., Li, H., Mei, T.: Video captioning with transferred semantic attributes. In: CVPR (2017)","DOI":"10.1109\/CVPR.2017.111"},{"key":"7_CR28","unstructured":"Long, X., Gan, C.-Y., de Melo, G.: Video captioning with multi-faceted attention. CoRR, abs\/1612.00234 (2016)"},{"key":"7_CR29","doi-asserted-by":"crossref","unstructured":"Song, J., Gao, L., Guo, Z., Liu, W., Zhang, D., Shen, H.T.: Hierarchical LSTM with adjusted temporal attention for video captioning. In: Proceedings of the Twenty-Sixth International Joint Conference on Artificial Intelligence, IJCAI 2017, pp. 2737\u20132743 (2017)","DOI":"10.24963\/ijcai.2017\/381"},{"key":"7_CR30","doi-asserted-by":"crossref","unstructured":"Yu, Y., Choi, J.-S., Kim, Y., Yoo, K., Lee, S., Kim, G.: Supervising neural attention models for video captioning by human gaze data. In: CVPR (2017)","DOI":"10.1109\/CVPR.2017.648"},{"key":"7_CR31","unstructured":"Zeiler, M.D.: AdaDelta: an adaptive learning rate method. arXiv preprint arXiv:1212.5701 (2012)"},{"key":"7_CR32","doi-asserted-by":"crossref","unstructured":"Yang, A., et al.: Vid2Seq: large-scale pretraining of a visual language model for dense video captioning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 10714\u201310726 (2023)","DOI":"10.1109\/CVPR52729.2023.01032"},{"key":"7_CR33","unstructured":"Wang, Z., et al.: Language models with image descriptors are strong few-shot video-language learners. arXiv preprint arXiv:2205.10747 (2022)"},{"key":"7_CR34","doi-asserted-by":"crossref","unstructured":"Li, M., et al.: Clip-event: connecting text and images with event structures. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 16420\u201316429 (2022)","DOI":"10.1109\/CVPR52688.2022.01593"},{"key":"7_CR35","unstructured":"Yang, Y., Hospedales, T.: Deep multi-task representation learning: a tensor factorisation approach. In: International Conference on Learning Representations (2017)"},{"key":"7_CR36","unstructured":"Barrow, H.G., Tenenbaum, J.M., Bolles, R.C., Wolf, H.C.: Parametric correspondence and chamfer matching: two new techniques for image matching. Technical report, SRI AI Center (1977)"},{"key":"7_CR37","unstructured":"Zaremba, W., Sutskever, I.: Learning to execute. arXiv preprint arXiv:1410.4615 (2014)"},{"issue":"3","key":"7_CR38","doi-asserted-by":"publisher","first-page":"303","DOI":"10.1016\/0098-3004(93)90090-R","volume":"19","author":"A Ma\u0107kiewicz","year":"1993","unstructured":"Ma\u0107kiewicz, A., Ratajczak, W.: Principal components analysis (PCA). Comput. Geosci. 19(3), 303\u2013342 (1993)","journal-title":"Comput. Geosci."},{"key":"7_CR39","doi-asserted-by":"publisher","first-page":"35","DOI":"10.1007\/978-1-4842-6168-2_4","volume-title":"Convolutional Neural Networks with Swift for Tensorflow","author":"B Koonce","year":"2021","unstructured":"Koonce, B., Koonce, B.: VGG network. In: Koonce, B. (ed.) Convolutional Neural Networks with Swift for Tensorflow, pp. 35\u201350. Springer, Cham (2021). https:\/\/doi.org\/10.1007\/978-1-4842-6168-2_4"},{"key":"7_CR40","unstructured":"Krizhevsky, A.: One weird trick for parallelizing convolutional neural networks. CoRR, abs\/1404.5997 (2014)"},{"key":"7_CR41","doi-asserted-by":"crossref","unstructured":"Vedantam, R., Zitnick, C.L., Parikh, D.: CIDEr: consensus-based image description evaluation. In: CVPR (2015)","DOI":"10.1109\/CVPR.2015.7299087"},{"key":"7_CR42","doi-asserted-by":"crossref","unstructured":"Denkowski, M., Lavie, A.: Meteor universal: language specific translation evaluation for any target language. In: EACL (2014)","DOI":"10.3115\/v1\/W14-3348"},{"key":"7_CR43","doi-asserted-by":"crossref","unstructured":"Papineni, K., Roukos, S., Ward, T., Zhu, W.-J.: BLEU: a method for automatic evaluation of machine translation. In: ACL (2002)","DOI":"10.3115\/1073083.1073135"},{"key":"7_CR44","unstructured":"Lin, C.-Y.: ROUGE: a package for automatic evaluation of summaries. In: Text Summarization Branches Out: Proceedings of the ACL-04 Workshop, pp. 74\u201381 (2004)"},{"key":"7_CR45","unstructured":"Chen, X., et al.: Microsoft COCO captions: data collection and evaluation server. arXiv preprint arXiv:1504.00325 (2015)"},{"key":"7_CR46","unstructured":"Thomason, J., Venugopalan, S., Guadarrama, S., Saenko, K., Mooney, R.J.: Integrating language and vision to generate natural language descriptions of videos in the wild. In: COLING (2014)"},{"key":"7_CR47","unstructured":"Hodosh, M., Young, P., Lai, A., Hockenmaier, J.: From image descriptions to visual denotations: new similarity metrics for semantic inference over event descriptions. In: TACL (2014)"},{"key":"7_CR48","doi-asserted-by":"crossref","unstructured":"Venugopalan, S., Xu, H., Donahue, J., Rohrbach, M., Mooney, R., Saenko, K.: Translating videos to natural language using deep recurrent neural networks. In: NAACL (2015)","DOI":"10.3115\/v1\/N15-1173"},{"key":"7_CR49","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"740","DOI":"10.1007\/978-3-319-10602-1_48","volume-title":"Computer Vision \u2013 ECCV 2014","author":"T-Y Lin","year":"2014","unstructured":"Lin, T.-Y., et al.: Microsoft COCO: common objects in context. In: Fleet, D., Pajdla, T., Schiele, B., Tuytelaars, T. (eds.) ECCV 2014. LNCS, vol. 8693, pp. 740\u2013755. Springer, Cham (2014). https:\/\/doi.org\/10.1007\/978-3-319-10602-1_48"},{"key":"7_CR50","doi-asserted-by":"crossref","unstructured":"Szegedy, C., et al.: Going deeper with convolutions. In: CVPR (2015)","DOI":"10.1109\/CVPR.2015.7298594"},{"key":"7_CR51","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"209","DOI":"10.1007\/978-3-319-24947-6_17","volume-title":"Pattern Recognition","author":"A Rohrbach","year":"2015","unstructured":"Rohrbach, A., Rohrbach, M., Schiele, B.: The long-short story of movie description. In: Gall, J., Gehler, P., Leibe, B. (eds.) GCPR 2015. LNCS, vol. 9358, pp. 209\u2013221. Springer, Cham (2015). https:\/\/doi.org\/10.1007\/978-3-319-24947-6_17"}],"container-title":["Lecture Notes in Computer Science","Database and Expert Systems Applications"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-39821-6_7","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,10,26]],"date-time":"2024-10-26T06:09:48Z","timestamp":1729922988000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-39821-6_7"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023]]},"ISBN":["9783031398209","9783031398216"],"references-count":51,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-39821-6_7","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2023]]},"assertion":[{"value":"16 August 2023","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"DEXA","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Conference on Database and Expert Systems Applications","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Penang","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Malaysia","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2023","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"28 August 2023","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"30 August 2023","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"34","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"dexa2023","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/www.dexa.org\/dexa2023","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Single-blind","order":1,"name":"type","label":"Type","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"EquinOCS","order":2,"name":"conference_management_system","label":"Conference Management System","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"155","order":3,"name":"number_of_submissions_sent_for_review","label":"Number of Submissions Sent for Review","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"49","order":4,"name":"number_of_full_papers_accepted","label":"Number of Full Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"35","order":5,"name":"number_of_short_papers_accepted","label":"Number of Short Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"32% - The value is computed by the equation \"Number of Full Papers Accepted \/ Number of Submissions Sent for Review * 100\" and then rounded to a whole number.","order":6,"name":"acceptance_rate_of_full_papers","label":"Acceptance Rate of Full Papers","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"3","order":7,"name":"average_number_of_reviews_per_paper","label":"Average Number of Reviews per Paper","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"4","order":8,"name":"average_number_of_papers_per_reviewer","label":"Average Number of Papers per Reviewer","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"Yes","order":9,"name":"external_reviewers_involved","label":"External Reviewers Involved","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"For the workshops 7 full and 3 short papers have been accepted from 20 submissions","order":10,"name":"additional_info_on_review_process","label":"Additional Info on Review Process","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}}]}}