{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,3,28]],"date-time":"2025-03-28T03:24:02Z","timestamp":1743132242142,"version":"3.40.3"},"publisher-location":"Singapore","reference-count":27,"publisher":"Springer Singapore","isbn-type":[{"type":"print","value":"9789811611025"},{"type":"electronic","value":"9789811611032"}],"license":[{"start":{"date-parts":[[2021,1,1]],"date-time":"2021-01-01T00:00:00Z","timestamp":1609459200000},"content-version":"tdm","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2021,1,1]],"date-time":"2021-01-01T00:00:00Z","timestamp":1609459200000},"content-version":"vor","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2021]]},"DOI":"10.1007\/978-981-16-1103-2_39","type":"book-chapter","created":{"date-parts":[[2021,3,25]],"date-time":"2021-03-25T08:03:32Z","timestamp":1616659412000},"page":"465-477","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["A Novel Approach for Video Captioning Based on Semantic Cross Embedding and Skip-Connection"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-7548-9205","authenticated-orcid":false,"given":"Rakesh","family":"Radarapu","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8328-2116","authenticated-orcid":false,"given":"Nishanth","family":"Bandari","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6233-1129","authenticated-orcid":false,"given":"Satwik","family":"Muthyam","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8989-6282","authenticated-orcid":false,"given":"Dinesh","family":"Naik","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2021,3,26]]},"reference":[{"key":"39_CR1","doi-asserted-by":"crossref","unstructured":"Baraldi, L., Grana, C., Cucchiara, R.: Hierarchical boundary-aware neural encoder for video captioning. In: 2017 IEEE Conference on Computer Vision and Pattern Recognition (CVPR), pp. 3185\u20133194 (2017)","DOI":"10.1109\/CVPR.2017.339"},{"issue":"7","key":"39_CR2","doi-asserted-by":"publisher","first-page":"2631","DOI":"10.1109\/TCYB.2018.2831447","volume":"49","author":"Y Bin","year":"2019","unstructured":"Bin, Y., Yang, Y., Shen, F., Xie, N., Shen, H.T., Li, X.: Describing video with attention-based bidirectional LSTM. IEEE Trans. Cybern. 49(7), 2631\u20132641 (2019)","journal-title":"IEEE Trans. Cybern."},{"key":"39_CR3","unstructured":"Devlin, J., Chang, M., Lee, K., Toutanova, K.: BERT: pre-training of deep bidirectional transformers for language understanding. CoRR abs\/1810.04805 (2018)"},{"key":"39_CR4","doi-asserted-by":"crossref","unstructured":"Fang, H., et al.: From captions to visual concepts and back. In: 2015 IEEE Conference on Computer Vision and Pattern Recognition (CVPR), pp. 1473\u20131482 (2015)","DOI":"10.1109\/CVPR.2015.7298754"},{"key":"39_CR5","doi-asserted-by":"crossref","unstructured":"Gan, Z., et al.: Semantic compositional networks for visual captioning. In: 2017 IEEE Conference on Computer Vision and Pattern Recognition (CVPR), pp. 1141\u20131150 (2017)","DOI":"10.1109\/CVPR.2017.127"},{"issue":"9","key":"39_CR6","doi-asserted-by":"publisher","first-page":"2045","DOI":"10.1109\/TMM.2017.2729019","volume":"19","author":"L Gao","year":"2017","unstructured":"Gao, L., Guo, Z., Zhang, H., Xu, X., Shen, H.T.: Video captioning with attention-based LSTM and semantic consistency. IEEE Trans. Multimed. 19(9), 2045\u20132055 (2017)","journal-title":"IEEE Trans. Multimed."},{"key":"39_CR7","doi-asserted-by":"crossref","unstructured":"Hao, W., Zhang, Z., Guan, H., Zhu, G.: Integrating both visual and audio cues for enhanced video caption (2017)","DOI":"10.1609\/aaai.v32i1.12330"},{"key":"39_CR8","doi-asserted-by":"publisher","unstructured":"Li, G., Ma, S., Han, Y.: Summarization-based video caption via deep neural networks. In: Proceedings of the 23rd ACM International Conference on Multimedia, MM 2015, pp. 1191\u20131194. Association for Computing Machinery, New York (2015). https:\/\/doi.org\/10.1145\/2733373.2806314","DOI":"10.1145\/2733373.2806314"},{"key":"39_CR9","doi-asserted-by":"crossref","unstructured":"Pan, Y., Yao, T., Li, H., Mei, T.: Video captioning with transferred semantic attributes. In: 2017 IEEE Conference on Computer Vision and Pattern Recognition (CVPR), pp. 984\u2013992 (2017)","DOI":"10.1109\/CVPR.2017.111"},{"key":"39_CR10","doi-asserted-by":"crossref","unstructured":"Pei, W., Zhang, J., Wang, X., Ke, L., Shen, X., Tai, Y.: Memory-attended recurrent network for video captioning. In: 2019 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 8339\u20138348 (2019)","DOI":"10.1109\/CVPR.2019.00854"},{"key":"39_CR11","doi-asserted-by":"crossref","unstructured":"Rennie, S.J., Marcheret, E., Mroueh, Y., Ross, J., Goel, V.: Self-critical sequence training for image captioning. In: 2017 IEEE Conference on Computer Vision and Pattern Recognition (CVPR), pp. 1179\u20131195 (2017)","DOI":"10.1109\/CVPR.2017.131"},{"key":"39_CR12","unstructured":"Simonyan, K., Zisserman, A.: Very deep convolutional networks for large-scale image recognition (2015)"},{"issue":"10","key":"39_CR13","doi-asserted-by":"publisher","first-page":"3047","DOI":"10.1109\/TNNLS.2018.2851077","volume":"30","author":"J Song","year":"2019","unstructured":"Song, J., Guo, Y., Gao, L., Li, X., Hanjalic, A., Shen, H.T.: From deterministic to generative: multimodal stochastic RNNs for video captioning. IEEE Trans. Neural Netw. Learn. Syst. 30(10), 3047\u20133058 (2019)","journal-title":"IEEE Trans. Neural Netw. Learn. Syst."},{"key":"39_CR14","unstructured":"Song, J., Li, X., Gao, L., Shen, H.T.: Hierarchical LSTMs with adaptive attention for visual captioning. CoRR abs\/1812.11004 (2018)"},{"key":"39_CR15","doi-asserted-by":"crossref","unstructured":"Szegedy, C., Vanhoucke, V., Ioffe, S., Shlens, J., Wojna, Z.: Rethinking the inception architecture for computer vision (2015)","DOI":"10.1109\/CVPR.2016.308"},{"key":"39_CR16","unstructured":"Vaswani, A., et al.: Attention is all you need. CoRR abs\/1706.03762 (2017)"},{"key":"39_CR17","doi-asserted-by":"crossref","unstructured":"Venugopalan, S., Rohrbach, M., Donahue, J., Mooney, R., Darrell, T., Saenko, K.: Sequence to sequence - video to text. In: 2015 IEEE International Conference on Computer Vision (ICCV), pp. 4534\u20134542 (2015)","DOI":"10.1109\/ICCV.2015.515"},{"key":"39_CR18","doi-asserted-by":"crossref","unstructured":"Vinyals, O., Toshev, A., Bengio, S., Erhan, D.: Show and tell: a neural image caption generator. CoRR abs\/1411.4555 (2014)","DOI":"10.1109\/CVPR.2015.7298935"},{"key":"39_CR19","doi-asserted-by":"crossref","unstructured":"Wang, J., Wang, W., Huang, Y., Wang, L., Tan, T.: M3: multimodal memory modelling for video captioning. In: 2018 IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 7512\u20137520 (2018)","DOI":"10.1109\/CVPR.2018.00784"},{"key":"39_CR20","unstructured":"Xu, H., Venugopalan, S., Ramanishka, V., Rohrbach, M., Saenko, K.: A multi-scale multiple instance video description network (2015)"},{"issue":"5","key":"39_CR21","doi-asserted-by":"publisher","first-page":"3419","DOI":"10.1109\/JIOT.2017.2779865","volume":"5","author":"N Xu","year":"2018","unstructured":"Xu, N., Liu, A., Nie, W., Su, Y.: Attention-in-attention networks for surveillance video understanding in internet of things. IEEE Internet Things J. 5(5), 3419\u20133429 (2018)","journal-title":"IEEE Internet Things J."},{"issue":"10","key":"39_CR22","doi-asserted-by":"publisher","first-page":"4933","DOI":"10.1109\/TIP.2018.2846664","volume":"27","author":"Y Xu","year":"2018","unstructured":"Xu, Y., Han, Y., Hong, R., Tian, Q.: Sequential video VLAD: training the aggregation locally and temporally. IEEE Trans. Image Process. 27(10), 4933\u20134944 (2018)","journal-title":"IEEE Trans. Image Process."},{"key":"39_CR23","doi-asserted-by":"crossref","unstructured":"Yang, Y., et al.: Video captioning by adversarial LSTM. IEEE Trans. Image Process. 27(11), 5600\u20135611 (2018)","DOI":"10.1109\/TIP.2018.2855422"},{"key":"39_CR24","doi-asserted-by":"crossref","unstructured":"Yao, L., et al.: Describing videos by exploiting temporal structure. In: 2015 IEEE International Conference on Computer Vision (ICCV), pp. 4507\u20134515 (2015)","DOI":"10.1109\/ICCV.2015.512"},{"key":"39_CR25","doi-asserted-by":"crossref","unstructured":"You, Q., Jin, H., Wang, Z., Fang, C., Luo, J.: Image captioning with semantic attention. CoRR abs\/1603.03925 (2016)","DOI":"10.1109\/CVPR.2016.503"},{"key":"39_CR26","doi-asserted-by":"crossref","unstructured":"Zheng, Q., Wang, C., Tao, D.: Syntax-aware action targeting for video captioning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), June 2020","DOI":"10.1109\/CVPR42600.2020.01311"},{"key":"39_CR27","doi-asserted-by":"crossref","unstructured":"Zoph, B., Vasudevan, V., Shlens, J., Le, Q.V.: Learning transferable architectures for scalable image recognition. CoRR abs\/1707.07012 (2017)","DOI":"10.1109\/CVPR.2018.00907"}],"container-title":["Communications in Computer and Information Science","Computer Vision and Image Processing"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-981-16-1103-2_39","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,12,22]],"date-time":"2022-12-22T11:52:15Z","timestamp":1671709935000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-981-16-1103-2_39"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2021]]},"ISBN":["9789811611025","9789811611032"],"references-count":27,"URL":"https:\/\/doi.org\/10.1007\/978-981-16-1103-2_39","relation":{},"ISSN":["1865-0929","1865-0937"],"issn-type":[{"type":"print","value":"1865-0929"},{"type":"electronic","value":"1865-0937"}],"subject":[],"published":{"date-parts":[[2021]]},"assertion":[{"value":"26 March 2021","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"CVIP","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Conference on Computer Vision and Image Processing","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Prayagraj","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"India","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2020","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 December 2020","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"6 December 2020","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"5","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"cvip2020","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/cvip2020.iiita.ac.in","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Single-blind","order":1,"name":"type","label":"Type","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"CMT","order":2,"name":"conference_management_system","label":"Conference Management System","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"352","order":3,"name":"number_of_submissions_sent_for_review","label":"Number of Submissions Sent for Review","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"134","order":4,"name":"number_of_full_papers_accepted","label":"Number of Full Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"0","order":5,"name":"number_of_short_papers_accepted","label":"Number of Short Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"38% - The value is computed by the equation \"Number of Full Papers Accepted \/ Number of Submissions Sent for Review * 100\" and then rounded to a whole number.","order":6,"name":"acceptance_rate_of_full_papers","label":"Acceptance Rate of Full Papers","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"3","order":7,"name":"average_number_of_reviews_per_paper","label":"Average Number of Reviews per Paper","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"4","order":8,"name":"average_number_of_papers_per_reviewer","label":"Average Number of Papers per Reviewer","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"Yes","order":9,"name":"external_reviewers_involved","label":"External Reviewers Involved","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"Due to the COVID-19 pandemic the conference was partially held in a virtual mode.","order":10,"name":"additional_info_on_review_process","label":"Additional Info on Review Process","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}}]}}