{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,2]],"date-time":"2026-01-02T07:34:37Z","timestamp":1767339277296,"version":"3.40.3"},"publisher-location":"Cham","reference-count":44,"publisher":"Springer Nature Switzerland","isbn-type":[{"type":"print","value":"9783031200588"},{"type":"electronic","value":"9783031200595"}],"license":[{"start":{"date-parts":[[2022,1,1]],"date-time":"2022-01-01T00:00:00Z","timestamp":1640995200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2022,1,1]],"date-time":"2022-01-01T00:00:00Z","timestamp":1640995200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2022]]},"DOI":"10.1007\/978-3-031-20059-5_21","type":"book-chapter","created":{"date-parts":[[2022,10,28]],"date-time":"2022-10-28T16:02:50Z","timestamp":1666972970000},"page":"363-379","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":17,"title":["Unifying Event Detection and\u00a0Captioning as\u00a0Sequence Generation via\u00a0Pre-training"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-5902-1955","authenticated-orcid":false,"given":"Qi","family":"Zhang","sequence":"first","affiliation":[]},{"given":"Yuqing","family":"Song","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6486-6020","authenticated-orcid":false,"given":"Qin","family":"Jin","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2022,10,29]]},"reference":[{"key":"21_CR1","doi-asserted-by":"crossref","unstructured":"Agrawal, H., et al.: NoCaps: novel object captioning at scale. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision (2019)","DOI":"10.1109\/ICCV.2019.00904"},{"key":"21_CR2","unstructured":"Banerjee, S., Lavie, A.: METEOR: an automatic metric for MT evaluation with improved correlation with human judgments. In: Proceedings of the ACL Workshop on Intrinsic and Extrinsic Evaluation Measures for machine translation and\/or Summarization, pp. 65\u201372 (2005)"},{"key":"21_CR3","doi-asserted-by":"crossref","unstructured":"Buch, S., Escorcia, V., Shen, C., Ghanem, B., Carlos Niebles, J.: SST: single-stream temporal action proposals. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (2017)","DOI":"10.1109\/CVPR.2017.675"},{"key":"21_CR4","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"213","DOI":"10.1007\/978-3-030-58452-8_13","volume-title":"Computer Vision \u2013 ECCV 2020","author":"N Carion","year":"2020","unstructured":"Carion, N., Massa, F., Synnaeve, G., Usunier, N., Kirillov, A., Zagoruyko, S.: End-to-End object detection with transformers. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12346, pp. 213\u2013229. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58452-8_13"},{"key":"21_CR5","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"104","DOI":"10.1007\/978-3-030-58577-8_7","volume-title":"Computer Vision \u2013 ECCV 2020","author":"Y-C Chen","year":"2020","unstructured":"Chen, Y.-C., et al.: UNITER: universal image-text representation learning. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12375, pp. 104\u2013120. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58577-8_7"},{"key":"21_CR6","unstructured":"Cho, J., Lei, J., Tan, H., Bansal, M.: Unifying vision-and-language tasks via text generation. In: International Conference on Machine Learning (2021)"},{"key":"21_CR7","doi-asserted-by":"crossref","unstructured":"Cornia, M., Stefanini, M., Baraldi, L., Cucchiara, R.: Meshed-memory transformer for image captioning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 10578\u201310587 (2020)","DOI":"10.1109\/CVPR42600.2020.01059"},{"key":"21_CR8","doi-asserted-by":"crossref","unstructured":"Deng, C., Chen, S., Chen, D., He, Y., Wu, Q.: Sketch, ground, and refine: Top-down dense video captioning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition., pp. 234\u2013243 (2021)","DOI":"10.1109\/CVPR46437.2021.00030"},{"key":"21_CR9","unstructured":"Devlin, J., Chang, M., Lee, K., Toutanova, K.: BERT: pre-training of deep bidirectional transformers for language understanding. In: Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics, pp. 4171\u20134186 (2019)"},{"key":"21_CR10","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"768","DOI":"10.1007\/978-3-319-46487-9_47","volume-title":"Computer Vision \u2013 ECCV 2016","author":"V Escorcia","year":"2016","unstructured":"Escorcia, V., Caba Heilbron, F., Niebles, J.C., Ghanem, B.: DAPs: deep action proposals for action understanding. In: Leibe, B., Matas, J., Sebe, N., Welling, M. (eds.) ECCV 2016. LNCS, vol. 9907, pp. 768\u2013784. Springer, Cham (2016). https:\/\/doi.org\/10.1007\/978-3-319-46487-9_47"},{"key":"21_CR11","unstructured":"Faghri, F., Fleet, D.J., Kiros, J.R., Fidler, S.: VSE++: improving visual-semantic embeddings with hard negatives. In: British Machine Vision Conference, p. 12 (2018)"},{"key":"21_CR12","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"517","DOI":"10.1007\/978-3-030-58539-6_31","volume-title":"Computer Vision \u2013 ECCV 2020","author":"S Fujita","year":"2020","unstructured":"Fujita, S., Hirao, T., Kamigaito, H., Okumura, M., Nagata, M.: SODA: story oriented dense video captioning evaluation framework. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12351, pp. 517\u2013531. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58539-6_31"},{"key":"21_CR13","doi-asserted-by":"crossref","unstructured":"Gao, J., Yang, Z., Chen, K., Sun, C., Nevatia, R.: Turn tap: temporal unit regression network for temporal action proposals. In: Proceedings of the IEEE International Conference on Computer Vision (2017)","DOI":"10.1109\/ICCV.2017.392"},{"key":"21_CR14","doi-asserted-by":"crossref","unstructured":"Goyal, Y., Khot, T., Summers-Stay, D., Batra, D., Parikh, D.: Making the V in VQA matter: elevating the role of image understanding in visual question answering. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (2017)","DOI":"10.1109\/CVPR.2017.670"},{"key":"21_CR15","doi-asserted-by":"crossref","unstructured":"Graves, A., Schmidhuber, J.: Framewise phoneme classification with bidirectional LSTM and other neural network architectures. Neural Netw. 18(5), 602\u2013610 (2005), iJCNN 2005","DOI":"10.1016\/j.neunet.2005.06.042"},{"key":"21_CR16","doi-asserted-by":"crossref","unstructured":"Heilbron, F.C., Niebles, J.C., Ghanem, B.: Fast temporal activity proposals for efficient detection of human actions in untrimmed videos. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (2016)","DOI":"10.1109\/CVPR.2016.211"},{"key":"21_CR17","doi-asserted-by":"crossref","unstructured":"Huang, L., Wang, W., Chen, J., Wei, X.Y.: Attention on attention for image captioning. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 4634\u20134643 (2019)","DOI":"10.1109\/ICCV.2019.00473"},{"key":"21_CR18","doi-asserted-by":"crossref","unstructured":"Huang, Z., Zeng, Z., Huang, Y., Liu, B., Fu, D., Fu, J.: Seeing out of the box: end-to-end pre-training for vision-language representation learning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 12976\u201312985 (2021)","DOI":"10.1109\/CVPR46437.2021.01278"},{"key":"21_CR19","doi-asserted-by":"crossref","unstructured":"Iashin, V., Rahtu, E.: A better use of audio-visual cues: Dense video captioning with bi-modal transformer. In: British Machine Vision Conference. https:\/\/arxiv.org\/abs\/2005.08271(2020)","DOI":"10.1109\/CVPRW50498.2020.00487"},{"key":"21_CR20","doi-asserted-by":"crossref","unstructured":"Iashin, V., Rahtu, E.: Multi-modal dense video captioning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition Workshops (2020)","DOI":"10.1109\/CVPRW50498.2020.00487"},{"key":"21_CR21","unstructured":"Kingma, D.P., Ba, J.: Adam: a method for stochastic optimization. In: Bengio, Y., LeCun, Y. (eds.) International Conference on Learning Representations (2015)"},{"key":"21_CR22","doi-asserted-by":"crossref","unstructured":"Krishna, R., Hata, K., Ren, F., Fei-Fei, L., Carlos Niebles, J.: Dense-captioning events in videos. In: Proceedings of the IEEE International Conference on Computer Vision (2017)","DOI":"10.1109\/ICCV.2017.83"},{"key":"21_CR23","doi-asserted-by":"crossref","unstructured":"Li, G., Duan, N., Fang, Y., Gong, M., Jiang, D.: Unicoder-vl: A universal encoder for vision and language by cross-modal pre-training. In: Proceedings of the AAAI Conference on Artificial Intelligence. pp. 11336\u201311344 (2020)","DOI":"10.1609\/aaai.v34i07.6795"},{"key":"21_CR24","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"121","DOI":"10.1007\/978-3-030-58577-8_8","volume-title":"Computer Vision \u2013 ECCV 2020","author":"X Li","year":"2020","unstructured":"Li, X., et al.: Oscar: object-semantics aligned pre-training for vision-language tasks. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12375, pp. 121\u2013137. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58577-8_8"},{"key":"21_CR25","doi-asserted-by":"crossref","unstructured":"Li, Y., Yao, T., Pan, Y., Chao, H., Mei, T.: Jointly localizing and describing events for dense video captioning. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (2018)","DOI":"10.1109\/CVPR.2018.00782"},{"key":"21_CR26","doi-asserted-by":"crossref","unstructured":"Li, Y., et al.: TGIF: a new dataset and benchmark on animated gif description. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (2016)","DOI":"10.1109\/CVPR.2016.502"},{"key":"21_CR27","doi-asserted-by":"crossref","unstructured":"Mun, J., Yang, L., Ren, Z., Xu, N., Han, B.: Streamlined dense video captioning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 6588\u20136597 (2019)","DOI":"10.1109\/CVPR.2019.00675"},{"key":"21_CR28","doi-asserted-by":"crossref","unstructured":"Papineni, K., Roukos, S., Ward, T., Zhu, W.J.: Bleu: a method for automatic evaluation of machine translation. In: Proceedings of the 40th annual meeting of the Association for Computational Linguistics, pp. 311\u2013318 (2002)","DOI":"10.3115\/1073083.1073135"},{"key":"21_CR29","doi-asserted-by":"crossref","unstructured":"Shou, Z., Wang, D., Chang, S.F.: Temporal action localization in untrimmed videos via multi-stage CNNs. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (2016)","DOI":"10.1109\/CVPR.2016.119"},{"key":"21_CR30","unstructured":"Su, W., et al.: VL-BERT: pre-training of generic visual-linguistic representations. In: International Conference on Learning Representations (2020)"},{"key":"21_CR31","doi-asserted-by":"crossref","unstructured":"Tran, D., Bourdev, L., Fergus, R., Torresani, L., Paluri, M.: Learning spatiotemporal features with 3d convolutional networks. In: Proceedings of the IEEE International Conference on Computer Vision (2015)","DOI":"10.1109\/ICCV.2015.510"},{"key":"21_CR32","unstructured":"Vaswani, A., et al.: Attention is all you need. In: Advances in Neural Information Processing Systems, pp. 5998\u20136008 (2017)"},{"key":"21_CR33","doi-asserted-by":"crossref","unstructured":"Vedantam, R., Lawrence Zitnick, C., Parikh, D.: Cider: consensus-based image description evaluation. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (2015)","DOI":"10.1109\/CVPR.2015.7299087"},{"key":"21_CR34","doi-asserted-by":"crossref","unstructured":"Vinyals, O., Toshev, A., Bengio, S., Erhan, D.: Show and tell: a neural image caption generator. In: Conference on Computer Vision and Pattern Recognition, pp. 3156\u20133164 (2015)","DOI":"10.1109\/CVPR.2015.7298935"},{"key":"21_CR35","doi-asserted-by":"crossref","unstructured":"Wang, J., Jiang, W., Ma, L., Liu, W., Xu, Y.: Bidirectional attentive fusion with context gating for dense video captioning. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 7190\u20137198 (2018)","DOI":"10.1109\/CVPR.2018.00751"},{"issue":"11","key":"21_CR36","doi-asserted-by":"publisher","first-page":"2740","DOI":"10.1109\/TPAMI.2018.2868668","volume":"41","author":"L Wang","year":"2018","unstructured":"Wang, L., et al.: Temporal segment networks for action recognition in videos. IEEE Trans. Pattern Anal. Mach. Intell. 41(11), 2740\u20132755 (2018)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"21_CR37","doi-asserted-by":"crossref","unstructured":"Wang, T., Zhang, R., Lu, Z., Zheng, F., Cheng, R., Luo, P.: End-to-end dense video captioning with parallel decoding. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 6847\u20136857 (2021)","DOI":"10.1109\/ICCV48922.2021.00677"},{"issue":"5","key":"21_CR38","doi-asserted-by":"publisher","first-page":"1890","DOI":"10.1109\/TCSVT.2020.3014606","volume":"31","author":"T Wang","year":"2020","unstructured":"Wang, T., Zheng, H., Yu, M., Tian, Q., Hu, H.: Event-centric hierarchical representation for dense video captioning. IEEE Trans. Circuits Syst. Video Technol. 31(5), 1890\u20131900 (2020)","journal-title":"IEEE Trans. Circuits Syst. Video Technol."},{"key":"21_CR39","doi-asserted-by":"crossref","unstructured":"Wang, X., Wu, J., Chen, J., Li, L., Wang, Y.F., Wang, W.Y.: Vatex: A large-scale, high-quality multilingual dataset for video-and-language research. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 4581\u20134591 (2019)","DOI":"10.1109\/ICCV.2019.00468"},{"key":"21_CR40","doi-asserted-by":"crossref","unstructured":"Xiong, Y., Dai, B., Lin, D.: Move forward and tell: a progressive generator of video descriptions. In: Proceedings of the European Conference on Computer Vision, pp. 468\u2013483 (2018)","DOI":"10.1007\/978-3-030-01252-6_29"},{"key":"21_CR41","doi-asserted-by":"crossref","unstructured":"Xu, J., Mei, T., Yao, T., Rui, Y.: MSR-VTT: a large video description dataset for bridging video and language. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 5288\u20135296 (2016)","DOI":"10.1109\/CVPR.2016.571"},{"key":"21_CR42","doi-asserted-by":"crossref","unstructured":"Yang, D., Yuan, C.: Hierarchical context encoding for events captioning in videos. In: International Conference on Image Processing, pp. 1288\u20131292 (2018)","DOI":"10.1109\/ICIP.2018.8451740"},{"key":"21_CR43","doi-asserted-by":"crossref","unstructured":"Zhou, L., Zhou, Y., Corso, J.J., Socher, R., Xiong, C.: End-to-end dense video captioning with masked transformer. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (2018)","DOI":"10.1109\/CVPR.2018.00911"},{"key":"21_CR44","unstructured":"Zhu, X., Su, W., Lu, L., Li, B., Wang, X., Dai, J.: Deformable DETR: deformable transformers for end-to-end object detection. In: International Conference on Learning Representations (2021)"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2022"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-20059-5_21","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,10,28]],"date-time":"2022-10-28T16:10:46Z","timestamp":1666973446000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-20059-5_21"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2022]]},"ISBN":["9783031200588","9783031200595"],"references-count":44,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-20059-5_21","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2022]]},"assertion":[{"value":"29 October 2022","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Tel Aviv","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Israel","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2022","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"23 October 2022","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"27 October 2022","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"17","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2022","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2022.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Double-blind","order":1,"name":"type","label":"Type","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"CMT","order":2,"name":"conference_management_system","label":"Conference Management System","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"5804","order":3,"name":"number_of_submissions_sent_for_review","label":"Number of Submissions Sent for Review","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"1645","order":4,"name":"number_of_full_papers_accepted","label":"Number of Full Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"0","order":5,"name":"number_of_short_papers_accepted","label":"Number of Short Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"28% - The value is computed by the equation \"Number of Full Papers Accepted \/ Number of Submissions Sent for Review * 100\" and then rounded to a whole number.","order":6,"name":"acceptance_rate_of_full_papers","label":"Acceptance Rate of Full Papers","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"3.21","order":7,"name":"average_number_of_reviews_per_paper","label":"Average Number of Reviews per Paper","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"3.91","order":8,"name":"average_number_of_papers_per_reviewer","label":"Average Number of Papers per Reviewer","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"Yes","order":9,"name":"external_reviewers_involved","label":"External Reviewers Involved","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}}]}}