{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,1]],"date-time":"2026-05-01T16:52:13Z","timestamp":1777654333582,"version":"3.51.4"},"publisher-location":"Cham","reference-count":79,"publisher":"Springer International Publishing","isbn-type":[{"value":"9783030585471","type":"print"},{"value":"9783030585488","type":"electronic"}],"license":[{"start":{"date-parts":[[2020,1,1]],"date-time":"2020-01-01T00:00:00Z","timestamp":1577836800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2020,1,1]],"date-time":"2020-01-01T00:00:00Z","timestamp":1577836800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2020]]},"DOI":"10.1007\/978-3-030-58548-8_20","type":"book-chapter","created":{"date-parts":[[2020,10,28]],"date-time":"2020-10-28T23:02:42Z","timestamp":1603926162000},"page":"333-351","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":71,"title":["Learning Modality Interaction for Temporal Sentence Localization and Event Captioning in Videos"],"prefix":"10.1007","author":[{"given":"Shaoxiang","family":"Chen","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Wenhao","family":"Jiang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Wei","family":"Liu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yu-Gang","family":"Jiang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2020,10,29]]},"reference":[{"key":"20_CR1","doi-asserted-by":"crossref","unstructured":"Aafaq, N., Akhtar, N., Liu, W., Gilani, S.Z., Mian, A.: Spatio-temporal dynamics and semantic attribute enriched visual encoding for video captioning. In: CVPR (2019)","DOI":"10.1109\/CVPR.2019.01277"},{"key":"20_CR2","doi-asserted-by":"crossref","unstructured":"Aytar, Y., Vondrick, C., Torralba, A.: SoundNet: learning sound representations from unlabeled video. In: NIPS (2016)","DOI":"10.1109\/CVPR.2016.18"},{"issue":"47","key":"20_CR3","doi-asserted-by":"publisher","first-page":"12260","DOI":"10.1523\/JNEUROSCI.1457-06.2006","volume":"26","author":"B Baier","year":"2006","unstructured":"Baier, B., Kleinschmidt, A., M\u00fcller, N.G.: Cross-modal processing in early visual and auditory cortices depends on expected statistical relationship of multisensory information. J. Neurosci. 26(47), 12260\u201312265 (2006)","journal-title":"J. Neurosci."},{"key":"20_CR4","doi-asserted-by":"crossref","unstructured":"Baraldi, L., Grana, C., Cucchiara, R.: Hierarchical boundary-aware neural encoder for video captioning. In: CVPR (2017)","DOI":"10.1109\/CVPR.2017.339"},{"issue":"12","key":"20_CR5","doi-asserted-by":"publisher","first-page":"1110","DOI":"10.1093\/cercor\/11.12.1110","volume":"11","author":"GA Calvert","year":"2001","unstructured":"Calvert, G.A.: Crossmodal processing in the human brain: insights from functional neuroimaging studies. Cereb. Cortex 11(12), 1110\u20131123 (2001)","journal-title":"Cereb. Cortex"},{"key":"20_CR6","unstructured":"Chen, D.L., Dolan, W.B.: Collecting highly parallel data for paraphrase evaluation. In: ACL (2011)"},{"key":"20_CR7","doi-asserted-by":"crossref","unstructured":"Chen, J., Pan, Y., Li, Y., Yao, T., Chao, H., Mei, T.: Temporal deformable convolutional encoder-decoder networks for video captioning. In: AAAI (2019)","DOI":"10.1609\/aaai.v33i01.33018167"},{"key":"20_CR8","doi-asserted-by":"crossref","unstructured":"Chen, J., Chen, X., Ma, L., Jie, Z., Chua, T.: Temporally grounding natural sentence in video. In: EMNLP (2018)","DOI":"10.18653\/v1\/D18-1015"},{"key":"20_CR9","doi-asserted-by":"crossref","unstructured":"Chen, S., Jiang, Y.: Motion guided spatial attention for video captioning. In: AAAI (2019)","DOI":"10.1609\/aaai.v33i01.33018191"},{"key":"20_CR10","doi-asserted-by":"crossref","unstructured":"Chen, S., Jiang, Y.: Semantic proposal for activity localization in videos via sentence query. In: AAAI (2019)","DOI":"10.1609\/aaai.v33i01.33018199"},{"key":"20_CR11","doi-asserted-by":"crossref","unstructured":"Chen, S., Chen, J., Jin, Q., Hauptmann, A.G.: Video captioning with guidance of multimodal latent topics. In: ACM MM (2017)","DOI":"10.1145\/3123266.3123420"},{"key":"20_CR12","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"367","DOI":"10.1007\/978-3-030-01261-8_22","volume-title":"Computer Vision \u2013 ECCV 2018","author":"Y Chen","year":"2018","unstructured":"Chen, Y., Wang, S., Zhang, W., Huang, Q.: Less is more: picking informative frames for video captioning. In: Ferrari, V., Hebert, M., Sminchisescu, C., Weiss, Y. (eds.) ECCV 2018. LNCS, vol. 11217, pp. 367\u2013384. Springer, Cham (2018). https:\/\/doi.org\/10.1007\/978-3-030-01261-8_22"},{"key":"20_CR13","unstructured":"Chung, J., G\u00fcl\u00e7ehre, \u00c7., Cho, K., Bengio, Y.: Empirical evaluation of gated recurrent neural networks on sequence modeling. arXiv preprint arXiv:1412.3555 (2014)"},{"key":"20_CR14","doi-asserted-by":"crossref","unstructured":"Denkowski, M.J., Lavie, A.: Meteor universal: language specific translation evaluation for any target language. In: WMT@ACL (2014)","DOI":"10.3115\/v1\/W14-3348"},{"issue":"7","key":"20_CR15","doi-asserted-by":"publisher","first-page":"848","DOI":"10.1002\/hbm.20560","volume":"29","author":"MA Eckert","year":"2008","unstructured":"Eckert, M.A., Kamdar, N.V., Chang, C.E., Beckmann, C.F., Greicius, M.D., Menon, V.: A cross-modal system linking primary auditory and visual cortices: evidence from intrinsic fMRI connectivity analysis. Hum. Brain Mapp. 29(7), 848\u2013857 (2008)","journal-title":"Hum. Brain Mapp."},{"key":"20_CR16","doi-asserted-by":"crossref","unstructured":"Fukui, A., Park, D.H., Yang, D., Rohrbach, A., Darrell, T., Rohrbach, M.: Multimodal compact bilinear pooling for visual question answering and visual grounding. In: EMNLP (2016)","DOI":"10.18653\/v1\/D16-1044"},{"key":"20_CR17","doi-asserted-by":"crossref","unstructured":"Gao, J., Sun, C., Yang, Z., Nevatia, R.: TALL: temporal activity localization via language query. In: ICCV (2017)","DOI":"10.1109\/ICCV.2017.563"},{"key":"20_CR18","doi-asserted-by":"crossref","unstructured":"Gao, P., et al.: Dynamic fusion with intra- and inter-modality attention flow for visual question answering. In: CVPR (2019)","DOI":"10.1109\/CVPR.2019.00680"},{"key":"20_CR19","doi-asserted-by":"crossref","unstructured":"Ge, R., Gao, J., Chen, K., Nevatia, R.: MAC: mining activity concepts for language-based temporal localization. In: WACV (2019)","DOI":"10.1109\/WACV.2019.00032"},{"key":"20_CR20","unstructured":"Hahn, M., Kadav, A., Rehg, J.M., Graf, H.P.: Tripping through time: efficient localization of activities in videos. arXiv preprint arXiv:1904.09936 (2019)"},{"key":"20_CR21","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., Sun, J.: Deep residual learning for image recognition. In: CVPR (2016)","DOI":"10.1109\/CVPR.2016.90"},{"key":"20_CR22","doi-asserted-by":"crossref","unstructured":"Heilbron, F.C., Escorcia, V., Ghanem, B., Niebles, J.C.: ActivityNet: a large-scale video benchmark for human activity understanding. In: CVPR (2015)","DOI":"10.1109\/CVPR.2015.7298698"},{"key":"20_CR23","doi-asserted-by":"crossref","unstructured":"Hendricks, L.A., Wang, O., Shechtman, E., Sivic, J., Darrell, T., Russell, B.C.: Localizing moments in video with natural language. In: ICCV (2017)","DOI":"10.1109\/ICCV.2017.618"},{"issue":"8","key":"20_CR24","doi-asserted-by":"publisher","first-page":"1735","DOI":"10.1162\/neco.1997.9.8.1735","volume":"9","author":"S Hochreiter","year":"1997","unstructured":"Hochreiter, S., Schmidhuber, J.: Long short-term memory. Neural Comput. 9(8), 1735\u20131780 (1997)","journal-title":"Neural Comput."},{"key":"20_CR25","doi-asserted-by":"crossref","unstructured":"Hori, C., et al.: Attention-based multimodal fusion for video description. In: ICCV (2017)","DOI":"10.1109\/ICCV.2017.450"},{"key":"20_CR26","doi-asserted-by":"crossref","unstructured":"Hu, Y., Chen, Z., Zha, Z., Wu, F.: Hierarchical global-local temporal modeling for video captioning. In: ACM MM (2019)","DOI":"10.1145\/3343031.3351072"},{"key":"20_CR27","doi-asserted-by":"crossref","unstructured":"Jiang, B., Huang, X., Yang, C., Yuan, J.: Cross-modal video moment retrieval with spatial and language-temporal attention. In: ICMR (2019)","DOI":"10.1145\/3323873.3325019"},{"key":"20_CR28","doi-asserted-by":"crossref","unstructured":"Jin, T., Huang, S., Li, Y., Zhang, Z.: Low-rank HOCA: efficient high-order cross-modal attention for video captioning. In: EMNLP-IJCNLP (2019)","DOI":"10.18653\/v1\/D19-1207"},{"key":"20_CR29","doi-asserted-by":"crossref","unstructured":"Karpathy, A., Li, F.: Deep visual-semantic alignments for generating image descriptions. In: CVPR (2015)","DOI":"10.1109\/CVPR.2015.7298932"},{"key":"20_CR30","unstructured":"Kim, J., Jun, J., Zhang, B.: Bilinear attention networks. In: NeurIPS (2018)"},{"key":"20_CR31","unstructured":"Kim, J., On, K.W., Lim, W., Kim, J., Ha, J., Zhang, B.: Hadamard product for low-rank bilinear pooling. In: ICLR (2017)"},{"key":"20_CR32","doi-asserted-by":"crossref","unstructured":"Krishna, R., Hata, K., Ren, F., Fei-Fei, L., Niebles, J.C.: Dense-captioning events in videos. In: ICCV (2017)","DOI":"10.1109\/ICCV.2017.83"},{"key":"20_CR33","doi-asserted-by":"crossref","unstructured":"Li, X., Zhao, B., Lu, X.: MAM-RNN: multi-level attention model based RNN for video captioning. In: IJCAI (2017)","DOI":"10.24963\/ijcai.2017\/307"},{"key":"20_CR34","doi-asserted-by":"crossref","unstructured":"Liu, M., Wang, X., Nie, L., He, X., Chen, B., Chua, T.: Attentive moment retrieval in videos. In: ACM SIGIR (2018)","DOI":"10.1145\/3209978.3210003"},{"key":"20_CR35","doi-asserted-by":"crossref","unstructured":"Liu, Z., Shen, Y., Lakshminarasimhan, V.B., Liang, P.P., Zadeh, A., Morency, L.: Efficient low-rank multimodal fusion with modality-specific factors. In: ACL (2018)","DOI":"10.18653\/v1\/P18-1209"},{"key":"20_CR36","doi-asserted-by":"publisher","first-page":"173","DOI":"10.1162\/tacl_a_00013","volume":"6","author":"X Long","year":"2018","unstructured":"Long, X., Gan, C., de Melo, G.: Video captioning with multi-faceted attention. TACL 6, 173\u2013184 (2018)","journal-title":"TACL"},{"key":"20_CR37","doi-asserted-by":"crossref","unstructured":"Pan, P., Xu, Z., Yang, Y., Wu, F., Zhuang, Y.: Hierarchical recurrent neural encoder for video representation with application to captioning. In: CVPR (2016)","DOI":"10.1109\/CVPR.2016.117"},{"key":"20_CR38","doi-asserted-by":"crossref","unstructured":"Papineni, K., Roukos, S., Ward, T., Zhu, W.: BLEU: a method for automatic evaluation of machine translation. In: ACL (2002)","DOI":"10.3115\/1073083.1073135"},{"key":"20_CR39","doi-asserted-by":"crossref","unstructured":"Pei, W., Zhang, J., Wang, X., Ke, L., Shen, X., Tai, Y.: Memory-attended recurrent network for video captioning. In: CVPR (2019)","DOI":"10.1109\/CVPR.2019.00854"},{"key":"20_CR40","doi-asserted-by":"crossref","unstructured":"Pennington, J., Socher, R., Manning, C.D.: GloVe: global vectors for word representation. In: EMNLP (2014)","DOI":"10.3115\/v1\/D14-1162"},{"key":"20_CR41","doi-asserted-by":"crossref","unstructured":"Rahman, T., Xu, B., Sigal, L.: Watch, listen and tell: multi-modal weakly supervised dense event captioning. In: ICCV (2019)","DOI":"10.1109\/ICCV.2019.00900"},{"key":"20_CR42","doi-asserted-by":"crossref","unstructured":"Shaw, P., Uszkoreit, J., Vaswani, A.: Self-attention with relative position representations. In: NAACL-HLT (2018)","DOI":"10.18653\/v1\/N18-2074"},{"key":"20_CR43","doi-asserted-by":"crossref","unstructured":"Shen, Z., et al.: Weakly supervised dense video captioning. In: CVPR (2017)","DOI":"10.1109\/CVPR.2017.548"},{"key":"20_CR44","doi-asserted-by":"crossref","unstructured":"Shi, X., Cai, J., Joty, S.R., Gu, J.: Watch it twice: video captioning with a refocused video encoder. In: ACM MM (2019)","DOI":"10.1145\/3343031.3351060"},{"key":"20_CR45","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"510","DOI":"10.1007\/978-3-319-46448-0_31","volume-title":"Computer Vision \u2013 ECCV 2016","author":"GA Sigurdsson","year":"2016","unstructured":"Sigurdsson, G.A., Varol, G., Wang, X., Farhadi, A., Laptev, I., Gupta, A.: Hollywood in homes: crowdsourcing data collection for activity understanding. In: Leibe, B., Matas, J., Sebe, N., Welling, M. (eds.) ECCV 2016. LNCS, vol. 9905, pp. 510\u2013526. Springer, Cham (2016). https:\/\/doi.org\/10.1007\/978-3-319-46448-0_31"},{"key":"20_CR46","unstructured":"Simonyan, K., Zisserman, A.: Very deep convolutional networks for large-scale image recognition. In: ICLR (2015)"},{"key":"20_CR47","doi-asserted-by":"crossref","unstructured":"Song, J., Gao, L., Guo, Z., Liu, W., Zhang, D., Shen, H.T.: Hierarchical LSTM with adjusted temporal attention for video captioning. In: IJCAI (2017)","DOI":"10.24963\/ijcai.2017\/381"},{"key":"20_CR48","doi-asserted-by":"crossref","unstructured":"Song, W., et al.: AutoInt: automatic feature interaction learning via self-attentive neural networks. In: CIKM (2019)","DOI":"10.1145\/3357384.3357925"},{"key":"20_CR49","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"340","DOI":"10.1007\/978-3-030-00767-6_32","volume-title":"Advances in Multimedia Information Processing \u2013 PCM 2018","author":"X Song","year":"2018","unstructured":"Song, X., Han, Y.: VAL: visual-attention action localizer. In: Hong, R., Cheng, W.-H., Yamasaki, T., Wang, M., Ngo, C.-W. (eds.) PCM 2018. LNCS, vol. 11165, pp. 340\u2013350. Springer, Cham (2018). https:\/\/doi.org\/10.1007\/978-3-030-00767-6_32"},{"key":"20_CR50","doi-asserted-by":"crossref","unstructured":"Szegedy, C., Ioffe, S., Vanhoucke, V., Alemi, A.A.: Inception-v4, inception-ResNet and the impact of residual connections on learning. In: AAAI (2017)","DOI":"10.1609\/aaai.v31i1.11231"},{"key":"20_CR51","doi-asserted-by":"crossref","unstructured":"Szegedy, C., et al.: Going deeper with convolutions. In: CVPR (2015)","DOI":"10.1109\/CVPR.2015.7298594"},{"key":"20_CR52","doi-asserted-by":"crossref","unstructured":"Tran, D., Bourdev, L.D., Fergus, R., Torresani, L., Paluri, M.: Learning spatiotemporal features with 3D convolutional networks. In: ICCV (2015)","DOI":"10.1109\/ICCV.2015.510"},{"key":"20_CR53","doi-asserted-by":"crossref","unstructured":"Tu, Y., Zhang, X., Liu, B., Yan, C.: Video description with spatial-temporal attention. In: ACM MM (2017)","DOI":"10.1145\/3123266.3123354"},{"key":"20_CR54","unstructured":"Ulyanov, D., Vedaldi, A., Lempitsky, V.S.: Instance normalization: the missing ingredient for fast stylization. arXiv preprint arXiv:1607.08022 (2016)"},{"key":"20_CR55","unstructured":"Vaswani, A., et al.: Attention is all you need. In: NIPS (2017)"},{"key":"20_CR56","doi-asserted-by":"crossref","unstructured":"Vedantam, R., Zitnick, C.L., Parikh, D.: CIDEr: consensus-based image description evaluation. In: CVPR (2015)","DOI":"10.1109\/CVPR.2015.7299087"},{"key":"20_CR57","doi-asserted-by":"crossref","unstructured":"Venugopalan, S., Rohrbach, M., Donahue, J., Mooney, R.J., Darrell, T., Saenko, K.: Sequence to sequence - video to text. In: ICCV (2015)","DOI":"10.1109\/ICCV.2015.515"},{"key":"20_CR58","doi-asserted-by":"crossref","unstructured":"Venugopalan, S., Xu, H., Donahue, J., Rohrbach, M., Mooney, R.J., Saenko, K.: Translating videos to natural language using deep recurrent neural networks. In: NAACL-HLT (2015)","DOI":"10.3115\/v1\/N15-1173"},{"key":"20_CR59","doi-asserted-by":"crossref","unstructured":"Wang, B., Ma, L., Zhang, W., Jiang, W., Wang, J., Liu, W.: Controllable video captioning with POS sequence guidance based on gated fusion network. In: ICCV (2019)","DOI":"10.1109\/ICCV.2019.00273"},{"key":"20_CR60","doi-asserted-by":"crossref","unstructured":"Wang, B., Ma, L., Zhang, W., Liu, W.: Reconstruction network for video captioning. In: CVPR (2018)","DOI":"10.1109\/CVPR.2018.00795"},{"key":"20_CR61","doi-asserted-by":"crossref","unstructured":"Wang, J., Jiang, W., Ma, L., Liu, W., Xu, Y.: Bidirectional attentive fusion with context gating for dense video captioning. In: CVPR (2018)","DOI":"10.1109\/CVPR.2018.00751"},{"key":"20_CR62","doi-asserted-by":"crossref","unstructured":"Wang, J., Ma, L., Jiang, W.: Temporally grounding language queries in videos by contextual boundary-aware prediction. In: AAAI (2020)","DOI":"10.1609\/aaai.v34i07.6897"},{"key":"20_CR63","doi-asserted-by":"crossref","unstructured":"Wang, J., Wang, W., Huang, Y., Wang, L., Tan, T.: M3: multimodal memory modelling for video captioning. In: CVPR (2018)","DOI":"10.1109\/CVPR.2018.00784"},{"key":"20_CR64","doi-asserted-by":"crossref","unstructured":"Wang, W., Huang, Y., Wang, L.: Language-driven temporal activity localization: a semantic matching reinforcement learning model. In: CVPR (2019)","DOI":"10.1109\/CVPR.2019.00042"},{"key":"20_CR65","doi-asserted-by":"crossref","unstructured":"Wang, X., Girshick, R.B., Gupta, A., He, K.: Non-local neural networks. In: CVPR (2018)","DOI":"10.1109\/CVPR.2018.00813"},{"key":"20_CR66","unstructured":"Xu, B., Wang, N., Chen, T., Li, M.: Empirical evaluation of rectified activations in convolutional network. arXiv preprint arXiv:1505.00853 (2015)"},{"key":"20_CR67","doi-asserted-by":"crossref","unstructured":"Xu, H., He, K., Plummer, B.A., Sigal, L., Sclaroff, S., Saenko, K.: Multilevel language and vision integration for text-to-clip retrieval. In: AAAI (2019)","DOI":"10.1609\/aaai.v33i01.33019062"},{"key":"20_CR68","doi-asserted-by":"crossref","unstructured":"Xu, J., Mei, T., Yao, T., Rui, Y.: MSR-VTT: a large video description dataset for bridging video and language. In: CVPR (2016)","DOI":"10.1109\/CVPR.2016.571"},{"key":"20_CR69","doi-asserted-by":"crossref","unstructured":"Xu, J., Yao, T., Zhang, Y., Mei, T.: Learning multimodal attention LSTM networks for video captioning. In: ACM MM (2017)","DOI":"10.1145\/3123266.3123448"},{"key":"20_CR70","doi-asserted-by":"crossref","unstructured":"Yang, Z., Han, Y., Wang, Z.: Catching the temporal regions-of-interest for video captioning. In: ACM MM (2017)","DOI":"10.1145\/3123266.3123327"},{"key":"20_CR71","doi-asserted-by":"crossref","unstructured":"Yao, L., et al.: Describing videos by exploiting temporal structure. In: ICCV (2015)","DOI":"10.1109\/ICCV.2015.512"},{"key":"20_CR72","doi-asserted-by":"crossref","unstructured":"Ye, L., Rochan, M., Liu, Z., Wang, Y.: Cross-modal self-attention network for referring image segmentation. In: CVPR (2019)","DOI":"10.1109\/CVPR.2019.01075"},{"key":"20_CR73","doi-asserted-by":"crossref","unstructured":"Yu, H., Wang, J., Huang, Z., Yang, Y., Xu, W.: Video paragraph captioning using hierarchical recurrent neural networks. In: CVPR (2016)","DOI":"10.1109\/CVPR.2016.496"},{"key":"20_CR74","doi-asserted-by":"crossref","unstructured":"Yuan, Y., Mei, T., Zhu, W.: To find where you talk: temporal sentence localization in video with attention based location regression. In: AAAI (2019)","DOI":"10.1609\/aaai.v33i01.33019159"},{"key":"20_CR75","doi-asserted-by":"crossref","unstructured":"Zhang, J., Peng, Y.: Object-aware aggregation with bidirectional temporal graph for video captioning. In: CVPR (2019)","DOI":"10.1109\/CVPR.2019.00852"},{"key":"20_CR76","doi-asserted-by":"crossref","unstructured":"Zhang, X., Gao, K., Zhang, Y., Zhang, D., Li, J., Tian, Q.: Task-driven dynamic fusion: reducing ambiguity in video description. In: CVPR (2017)","DOI":"10.1109\/CVPR.2017.662"},{"key":"20_CR77","doi-asserted-by":"crossref","unstructured":"Zhou, L., Zhou, Y., Corso, J.J., Socher, R., Xiong, C.: End-to-end dense video captioning with masked transformer. In: CVPR (2018)","DOI":"10.1109\/CVPR.2018.00911"},{"key":"20_CR78","doi-asserted-by":"crossref","unstructured":"Zhu, L., Xu, Z., Yang, Y.: Bidirectional multirate reconstruction for temporal modeling in videos. In: CVPR (2017)","DOI":"10.1109\/CVPR.2017.147"},{"key":"20_CR79","doi-asserted-by":"crossref","unstructured":"Zhu, Y., Jiang, S.: Attention-based densely connected LSTM for video captioning. In: ACM MM (2019)","DOI":"10.1145\/3343031.3350932"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2020"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-030-58548-8_20","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,10,29]],"date-time":"2024-10-29T00:11:16Z","timestamp":1730160676000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-030-58548-8_20"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2020]]},"ISBN":["9783030585471","9783030585488"],"references-count":79,"URL":"https:\/\/doi.org\/10.1007\/978-3-030-58548-8_20","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2020]]},"assertion":[{"value":"29 October 2020","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Glasgow","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"United Kingdom","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2020","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"23 August 2020","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"28 August 2020","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"16","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2020","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2020.eu\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Double-blind","order":1,"name":"type","label":"Type","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"OpenReview","order":2,"name":"conference_management_system","label":"Conference Management System","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"5025","order":3,"name":"number_of_submissions_sent_for_review","label":"Number of Submissions Sent for Review","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"1360","order":4,"name":"number_of_full_papers_accepted","label":"Number of Full Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"0","order":5,"name":"number_of_short_papers_accepted","label":"Number of Short Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"27% - The value is computed by the equation \"Number of Full Papers Accepted \/ Number of Submissions Sent for Review * 100\" and then rounded to a whole number.","order":6,"name":"acceptance_rate_of_full_papers","label":"Acceptance Rate of Full Papers","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"3","order":7,"name":"average_number_of_reviews_per_paper","label":"Average Number of Reviews per Paper","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"7","order":8,"name":"average_number_of_papers_per_reviewer","label":"Average Number of Papers per Reviewer","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"Yes","order":9,"name":"external_reviewers_involved","label":"External Reviewers Involved","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"The conference was held virtually due to the COVID-19 pandemic.","order":10,"name":"additional_info_on_review_process","label":"Additional Info on Review Process","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}}]}}