{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,3,27]],"date-time":"2025-03-27T20:46:59Z","timestamp":1743108419271,"version":"3.40.3"},"publisher-location":"Cham","reference-count":18,"publisher":"Springer Nature Switzerland","isbn-type":[{"type":"print","value":"9783031814037"},{"type":"electronic","value":"9783031814044"}],"license":[{"start":{"date-parts":[[2024,12,31]],"date-time":"2024-12-31T00:00:00Z","timestamp":1735603200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,12,31]],"date-time":"2024-12-31T00:00:00Z","timestamp":1735603200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-81404-4_15","type":"book-chapter","created":{"date-parts":[[2024,12,31]],"date-time":"2024-12-31T00:38:45Z","timestamp":1735605525000},"page":"195-208","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["InVideo Search: Scene Description Clustering and\u00a0Integrating Image and\u00a0Audio Captioning for\u00a0Enhanced Video Search"],"prefix":"10.1007","author":[{"given":"Almira Asif","family":"Khan","sequence":"first","affiliation":[]},{"family":"Muhammed","sequence":"additional","affiliation":[]},{"given":"Asher Mathews","family":"Shaji","sequence":"additional","affiliation":[]},{"given":"Devika","family":"Sujith","sequence":"additional","affiliation":[]},{"given":"Aneesh G.","family":"Nath","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9498-1230","authenticated-orcid":false,"given":"Sandeep S.","family":"Udmale","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,12,31]]},"reference":[{"issue":"11","key":"15_CR1","doi-asserted-by":"publisher","first-page":"1838","DOI":"10.1109\/JPROC.2021.3117472","volume":"109","author":"E Apostolidis","year":"2021","unstructured":"Apostolidis, E., Adamantidou, E., Metsai, A.I., Mezaris, V., Patras, I.: Video summarization using deep neural networks: a survey. Proc. IEEE 109(11), 1838\u20131863 (2021)","journal-title":"Proc. IEEE"},{"key":"15_CR2","doi-asserted-by":"publisher","first-page":"45219","DOI":"10.1109\/ACCESS.2022.3169781","volume":"10","author":"JW Bae","year":"2022","unstructured":"Bae, J.W., Lee, S.H., Kim, W.Y., Seong, J.H., Seo, D.H.: Image captioning model using part-of-speech guidance module for description with diverse vocabulary. IEEE Access 10, 45219\u201345229 (2022)","journal-title":"IEEE Access"},{"issue":"1","key":"15_CR3","first-page":"59","volume":"5","author":"A Berhe","year":"2019","unstructured":"Berhe, A., Barras, C., Guinaudeau, C.: Video scene segmentation of tv series using multimodal neural features. Series-Int. J. TV Serial Narratives 5(1), 59\u201368 (2019)","journal-title":"Series-Int. J. TV Serial Narratives"},{"key":"15_CR4","doi-asserted-by":"crossref","unstructured":"Blagec, K., Dorffner, G., Moradi, M., Ott, S., Samwald, M.: A global analysis of metrics used for measuring performance in natural language processing. arXiv preprint arXiv:2204.11574 (2022)","DOI":"10.18653\/v1\/2022.nlppower-1.6"},{"issue":"6","key":"15_CR5","doi-asserted-by":"publisher","first-page":"633","DOI":"10.1016\/j.cviu.2013.01.013","volume":"117","author":"JM Chaquet","year":"2013","unstructured":"Chaquet, J.M., Carmona, E.J., Fern\u00e1ndez-Caballero, A.: A survey of video datasets for human action and activity recognition. Comput. Vis. Image Underst. 117(6), 633\u2013659 (2013)","journal-title":"Comput. Vis. Image Underst."},{"key":"15_CR6","unstructured":"Devlin, J., Chang, M.W., Lee, K., Toutanova, K.: Bert: pre-training of deep bidirectional transformers for language understanding (2019). https:\/\/arxiv.org\/abs\/1810.04805"},{"key":"15_CR7","doi-asserted-by":"crossref","unstructured":"Xiao, F., Zhu, Q., Guan, J., Wang, W.: Enhancing audio retrieval with attention-based encoder for audio feature representation. In: EUSIPCO 2023 (2023)","DOI":"10.23919\/EUSIPCO58844.2023.10290096"},{"key":"15_CR8","unstructured":"Han, M., Yang, L., Chang, X., Wang, H.: Shot2story20k: a new benchmark for comprehensive understanding of multi-shot videos. arXiv preprint arXiv:2311.17043 (2023)"},{"key":"15_CR9","unstructured":"Iyer, R.R., Parekh, S., Mohandoss, V., Ramsurat, A., Raj, B., Singh, R.: Content-based video indexing and retrieval using CORR-LDA. arXiv preprint arXiv:1602.08581 (2016)"},{"key":"15_CR10","doi-asserted-by":"crossref","unstructured":"Jadon, S., Jasim, M.: Unsupervised video summarization framework using keyframe extraction and video skimming. In: 2020 IEEE 5th International Conference on Computing Communication and Automation (ICCCA), pp. 140\u2013145. IEEE (2020)","DOI":"10.1109\/ICCCA49541.2020.9250764"},{"key":"15_CR11","doi-asserted-by":"crossref","unstructured":"Drossos, K., Adavanne, S., Virtanen, T.: Automated audio captioning with recurrent neural networks. In: IEEE Workshop on Applications of Signal Processing to Audio and Acoustics (WASPAA). IEEE (2017)","DOI":"10.1109\/WASPAA.2017.8170058"},{"key":"15_CR12","doi-asserted-by":"crossref","unstructured":"Labb, E., Pellegrini, T., Pinquier, J., et al.: Conette: an efficient audio captioning system leveraging multiple datasets with task embedding. IEEE\/ACM Trans. Audio Speech Lang. Process. (2024)","DOI":"10.1109\/TASLP.2024.3430813"},{"key":"15_CR13","doi-asserted-by":"crossref","unstructured":"Peng, M., Wang, C., Gao, Y., Shi, Y., Zhou, X.D.: Multilevel hierarchical network with multiscale sampling for video question answering. arXiv preprint arXiv:2205.04061 (2022)","DOI":"10.24963\/ijcai.2022\/178"},{"key":"15_CR14","doi-asserted-by":"crossref","unstructured":"Ryu, H., Kang, S., Kang, H., Yoo, C.D.: Semantic grouping network for video captioning. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol.\u00a035, pp. 2514\u20132522 (2021)","DOI":"10.1609\/aaai.v35i3.16353"},{"issue":"1","key":"15_CR15","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1186\/s40537-021-00479-x","volume":"8","author":"EM Saoudi","year":"2021","unstructured":"Saoudi, E.M., Jai-Andaloussi, S.: A distributed content-based video retrieval system for large datasets. J. Big Data 8(1), 1\u201326 (2021)","journal-title":"J. Big Data"},{"key":"15_CR16","doi-asserted-by":"crossref","unstructured":"Vinyals, O., Toshev, A., Bengio, S., Erhan, D.: Show and tell: a neural image caption generator. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 3156\u20133164 (2015)","DOI":"10.1109\/CVPR.2015.7298935"},{"key":"15_CR17","doi-asserted-by":"crossref","unstructured":"Yang, X., Liu, Y., Wang, X.: Reformer: the relational transformer for image captioning. In: Proceedings of the 30th ACM International Conference on Multimedia, pp. 5398\u20135406 (2022)","DOI":"10.1145\/3503161.3548409"},{"key":"15_CR18","doi-asserted-by":"publisher","first-page":"30730","DOI":"10.1109\/ACCESS.2022.3160214","volume":"10","author":"H Yoon","year":"2022","unstructured":"Yoon, H., Han, J.H.: Content-based video retrieval with prototypes of deep features. IEEE Access 10, 30730\u201330742 (2022)","journal-title":"IEEE Access"}],"container-title":["Lecture Notes in Computer Science","Distributed Computing and Intelligent Technology"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-81404-4_15","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,12,31]],"date-time":"2024-12-31T01:04:07Z","timestamp":1735607047000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-81404-4_15"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,12,31]]},"ISBN":["9783031814037","9783031814044"],"references-count":18,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-81404-4_15","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2024,12,31]]},"assertion":[{"value":"31 December 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ICDCIT","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Conference on Distributed Computing and Intelligent Technology","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Bhubaneswar","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"India","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2025","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"8 January 2025","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"11 January 2025","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"21","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"icdcit2025","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/icdcit.ac.in","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}