{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,16]],"date-time":"2026-06-16T05:26:05Z","timestamp":1781587565442,"version":"3.54.5"},"publisher-location":"Cham","reference-count":62,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031197802","type":"print"},{"value":"9783031197819","type":"electronic"}],"license":[{"start":{"date-parts":[[2022,1,1]],"date-time":"2022-01-01T00:00:00Z","timestamp":1640995200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2022,1,1]],"date-time":"2022-01-01T00:00:00Z","timestamp":1640995200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2022]]},"DOI":"10.1007\/978-3-031-19781-9_29","type":"book-chapter","created":{"date-parts":[[2022,10,22]],"date-time":"2022-10-22T12:12:59Z","timestamp":1666440779000},"page":"497-514","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":5,"title":["Audio-Visual Mismatch-Aware Video Retrieval via\u00a0Association and\u00a0Adjustment"],"prefix":"10.1007","author":[{"given":"Sangmin","family":"Lee","sequence":"first","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Sungjune","family":"Park","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Yong Man","family":"Ro","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"297","published-online":{"date-parts":[[2022,10,23]]},"reference":[{"key":"29_CR1","doi-asserted-by":"crossref","unstructured":"Bain, M., Nagrani, A., Varol, G., Zisserman, A.: Frozen in time: A joint video and image encoder for end-to-end retrieval. In: IEEE\/CVF International Conference on Computer Vision (ICCV), pp. 1728\u20131738 (2021)","DOI":"10.1109\/ICCV48922.2021.00175"},{"key":"29_CR2","doi-asserted-by":"crossref","unstructured":"Cai, Q., Pan, Y., Yao, T., Yan, C., Mei, T.: Memory matching networks for one-shot image recognition. In: IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 4080\u20134088 (2018)","DOI":"10.1109\/CVPR.2018.00429"},{"key":"29_CR3","doi-asserted-by":"crossref","unstructured":"Carreira, J., Zisserman, A.: Quo vadis, action recognition? a new model and the kinetics dataset. In: IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 6299\u20136308 (2017)","DOI":"10.1109\/CVPR.2017.502"},{"key":"29_CR4","doi-asserted-by":"crossref","unstructured":"Chen, J., Hu, H., Wu, H., Jiang, Y., Wang, C.: Learning the best pooling strategy for visual semantic embedding. In: IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 15789\u201315798 (2021)","DOI":"10.1109\/CVPR46437.2021.01553"},{"key":"29_CR5","doi-asserted-by":"crossref","unstructured":"Chen, S., Zhao, Y., Jin, Q., Wu, Q.: Fine-grained video-text retrieval with hierarchical graph reasoning. In: IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 10638\u201310647 (2020)","DOI":"10.1109\/CVPR42600.2020.01065"},{"key":"29_CR6","unstructured":"Chen, T., Kornblith, S., Norouzi, M., Hinton, G.: A simple framework for contrastive learning of visual representations. In: International Conference on Machine Learning (ICML), pp. 1597\u20131607. PMLR (2020)"},{"key":"29_CR7","unstructured":"Devlin, J., Chang, M.W., Lee, K., Toutanova, K.: Bert: Pre-training of deep bidirectional transformers for language understanding. In: Conference of the North American Chapter of the Association for Computational Linguistics (NAACL-HLT) (2019)"},{"issue":"12","key":"29_CR8","doi-asserted-by":"publisher","first-page":"3377","DOI":"10.1109\/TMM.2018.2832602","volume":"20","author":"J Dong","year":"2018","unstructured":"Dong, J., Li, X., Snoek, C.G.: Predicting visual features from text for image and video caption retrieval. IEEE Trans. Multimedia 20(12), 3377\u20133388 (2018)","journal-title":"IEEE Trans. Multimedia"},{"key":"29_CR9","doi-asserted-by":"crossref","unstructured":"Dong, J., et al.: Dual encoding for zero-example video retrieval. In: IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 9346\u20139355 (2019)","DOI":"10.1109\/CVPR.2019.00957"},{"key":"29_CR10","first-page":"4065","volume":"44","author":"J Dong","year":"2021","unstructured":"Dong, J., et al.: Dual encoding for video retrieval by text. IEEE Trans. Pattern Anal. Mach. Intell. 44, 4065\u20134080 (2021)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"29_CR11","unstructured":"Dosovitskiy, A., et al.: An image is worth 16x16 words: Transformers for image recognition at scale. In: International Conference on Learning Representations (ICLR) (2020)"},{"key":"29_CR12","doi-asserted-by":"crossref","unstructured":"Dzabraev, M., Kalashnikov, M., Komkov, S., Petiushko, A.: Mdmmt: Multidomain multimodal transformer for video retrieval. In: IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 3354\u20133363 (2021)","DOI":"10.1109\/CVPRW53098.2021.00374"},{"key":"29_CR13","unstructured":"Faghri, F., Fleet, D.J., Kiros, J.R., Fidler, S.: Vse++: Improving visual-semantic embeddings with hard negatives. In: British Machine Vision Conference (BMVC) (2018)"},{"key":"29_CR14","doi-asserted-by":"crossref","unstructured":"Francis, D., Anh Nguyen, P., Huet, B., Ngo, C.W.: Fusion of multimodal embeddings for ad-hoc video search. In: IEEE\/CVF International Conference on Computer Vision Workshops (ICCVW) (2019)","DOI":"10.1109\/ICCVW.2019.00233"},{"key":"29_CR15","doi-asserted-by":"crossref","unstructured":"Fu, Z., Liu, Q., Fu, Z., Wang, Y.: Stmtrack: Template-free visual tracking with space-time memory networks. In: IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 13774\u201313783 (2021)","DOI":"10.1109\/CVPR46437.2021.01356"},{"key":"29_CR16","doi-asserted-by":"crossref","unstructured":"Gabeur, V., Nagrani, A., Sun, C., Alahari, K., Schmid, C.: Masking modalities for cross-modal video retrieval. In: IEEE\/CVF Winter Conference on Applications of Computer Vision (WACV), pp. 1766\u20131775 (2022)","DOI":"10.1109\/WACV51458.2022.00217"},{"key":"29_CR17","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"214","DOI":"10.1007\/978-3-030-58548-8_13","volume-title":"Computer Vision \u2013 ECCV 2020","author":"V Gabeur","year":"2020","unstructured":"Gabeur, V., Sun, C., Alahari, K., Schmid, C.: Multi-modal transformer for video retrieval. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12349, pp. 214\u2013229. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58548-8_13"},{"key":"29_CR18","doi-asserted-by":"crossref","unstructured":"Gong, D., et al.: Memorizing normality to detect anomaly: Memory-augmented deep autoencoder for unsupervised anomaly detection. In: IEEE\/CVF International Conference on Computer Vision (ICCV), pp. 1705\u20131714 (2019)","DOI":"10.1109\/ICCV.2019.00179"},{"key":"29_CR19","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"312","DOI":"10.1007\/978-3-030-58580-8_19","volume-title":"Computer Vision \u2013 ECCV 2020","author":"T Han","year":"2020","unstructured":"Han, T., Xie, W., Zisserman, A.: Memory-augmented dense predictive coding for video representation learning. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12348, pp. 312\u2013329. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58580-8_19"},{"key":"29_CR20","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., Sun, J.: Deep residual learning for image recognition. In: IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 770\u2013778 (2016)","DOI":"10.1109\/CVPR.2016.90"},{"key":"29_CR21","doi-asserted-by":"crossref","unstructured":"Hu, P., Peng, X., Zhu, H., Zhen, L., Lin, J.: Learning cross-modal retrieval with noisy labels. In: IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 5403\u20135413 (2021)","DOI":"10.1109\/CVPR46437.2021.00536"},{"key":"29_CR22","doi-asserted-by":"crossref","unstructured":"Huang, Y., Wang, L.: Acmm: Aligned cross-modal memory for few-shot image and sentence matching. In: IEEE\/CVF International Conference on Computer Vision (ICCV), pp. 5774\u20135783 (2019)","DOI":"10.1109\/ICCV.2019.00587"},{"key":"29_CR23","unstructured":"Kaiser, \u0141., Nachum, O., Roy, A., Bengio, S.: Learning to remember rare events. In: International Conference on Learning Representations (ICLR) (2017)"},{"key":"29_CR24","doi-asserted-by":"crossref","unstructured":"Karpathy, A., Fei-Fei, L.: Deep visual-semantic alignments for generating image descriptions. In: IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 3128\u20133137 (2015)","DOI":"10.1109\/CVPR.2015.7298932"},{"key":"29_CR25","doi-asserted-by":"crossref","unstructured":"Kim, J.U., Park, S., Ro, Y.M.: Robust small-scale pedestrian detection with cued recall via memory learning. In: IEEE\/CVF International Conference on Computer Vision (ICCV), pp. 3050\u20133059 (2021)","DOI":"10.1109\/ICCV48922.2021.00304"},{"key":"29_CR26","doi-asserted-by":"crossref","unstructured":"Kim, M., Hong, J., Park, S.J., Ro, Y.M.: Multi-modality associative bridging through memory: Speech sound recollected from face video. In: IEEE\/CVF International Conference on Computer Vision (ICCV), pp. 296\u2013306 (2021)","DOI":"10.1109\/ICCV48922.2021.00036"},{"key":"29_CR27","unstructured":"Kingma, D., Ba, J.: Adam: A method for stochastic optimization. In: International Conference on Learning Representations (ICLR) (2015)"},{"key":"29_CR28","unstructured":"Kiros, R., Salakhutdinov, R., Zemel, R.S.: Unifying visual-semantic embeddings with multimodal neural language models. arXiv preprint arXiv:1411.2539 (2014)"},{"key":"29_CR29","doi-asserted-by":"crossref","unstructured":"Lee, S., Kim, H.G., Choi, D.H., Kim, H.I., Ro, Y.M.: Video prediction recalling long-term motion context via memory alignment learning. In: IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 3054\u20133063 (2021)","DOI":"10.1109\/CVPR46437.2021.00307"},{"key":"29_CR30","doi-asserted-by":"crossref","unstructured":"Lee, S., Kim, H.I., Ro, Y.M.: Weakly paired associative learning for sound and image representations via bimodal associative memory. In: IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 10534\u201310543 (2022)","DOI":"10.1109\/CVPR52688.2022.01028"},{"key":"29_CR31","doi-asserted-by":"crossref","unstructured":"Li, X., Xu, C., Yang, G., Chen, Z., Dong, J.: W2VV++: fully deep learning for ad-hoc video search. In: ACM International Conference on Multimedia (ACM MM), pp. 1786\u20131794 (2019)","DOI":"10.1145\/3343031.3350906"},{"key":"29_CR32","doi-asserted-by":"publisher","first-page":"4351","DOI":"10.1109\/TMM.2020.3042067","volume":"23","author":"X Li","year":"2021","unstructured":"Li, X., Zhou, F., Xu, C., Ji, J., Yang, G.: Sea: Sentence encoder assembly for video retrieval by textual queries. IEEE Trans. Multimedia 23, 4351\u20134362 (2021)","journal-title":"IEEE Trans. Multimedia"},{"key":"29_CR33","doi-asserted-by":"crossref","unstructured":"Li, Y., et al.: Tgif: A new dataset and benchmark on animated gif description. In: IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 4641\u20134650 (2016)","DOI":"10.1109\/CVPR.2016.502"},{"key":"29_CR34","doi-asserted-by":"crossref","unstructured":"Liu, H., Luo, R., Shang, F., Niu, M., Liu, Y.: Progressive semantic matching for video-text retrieval. In: ACM International Conference on Multimedia (ACM MM), pp. 5083\u20135091 (2021)","DOI":"10.1145\/3474085.3475621"},{"key":"29_CR35","doi-asserted-by":"crossref","unstructured":"Liu, S., Fan, H., Qian, S., Chen, Y., Ding, W., Wang, Z.: Hit: Hierarchical transformer with momentum contrast for video-text retrieval. In: IEEE\/CVF International Conference on Computer Vision (ICCV), pp. 11915\u201311925 (2021)","DOI":"10.1109\/ICCV48922.2021.01170"},{"key":"29_CR36","unstructured":"Liu, Y., Albanie, S., Nagrani, A., Zisserman, A.: Use what you have: Video retrieval using representations from collaborative experts. In: British Machine Vision Conference (BMVC) (2019)"},{"key":"29_CR37","doi-asserted-by":"crossref","unstructured":"Marchetti, F., Becattini, F., Seidenari, L., Bimbo, A.D.: Mantra: Memory augmented networks for multiple trajectory prediction. In: IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 7143\u20137152 (2020)","DOI":"10.1109\/CVPR42600.2020.00717"},{"key":"29_CR38","doi-asserted-by":"crossref","unstructured":"Miech, A., Alayrac, J.B., Laptev, I., Sivic, J., Zisserman, A.: Thinking fast and slow: Efficient text-to-visual retrieval with transformers. In: IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 9826\u20139836 (2021)","DOI":"10.1109\/CVPR46437.2021.00970"},{"key":"29_CR39","doi-asserted-by":"crossref","unstructured":"Miech, A., Alayrac, J.B., Smaira, L., Laptev, I., Sivic, J., Zisserman, A.: End-to-end learning of visual representations from uncurated instructional videos. In: IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 9879\u20139889 (2020)","DOI":"10.1109\/CVPR42600.2020.00990"},{"key":"29_CR40","unstructured":"Miech, A., Laptev, I., Sivic, J.: Learning a text-video embedding from incomplete and heterogeneous data. arXiv preprint arXiv:1804.02516 (2018)"},{"key":"29_CR41","doi-asserted-by":"crossref","unstructured":"Miech, A., Zhukov, D., Alayrac, J.B., Tapaswi, M., Laptev, I., Sivic, J.: Howto100m: Learning a text-video embedding by watching hundred million narrated video clips. In: IEEE\/CVF International Conference on Computer Vision (ICCV), pp. 2630\u20132640 (2019)","DOI":"10.1109\/ICCV.2019.00272"},{"key":"29_CR42","doi-asserted-by":"crossref","unstructured":"Mithun, N.C., Li, J., Metze, F., Roy-Chowdhury, A.K.: Learning joint embedding with multimodal cues for cross-modal video-text retrieval. In: ACM International Conference on Multimedia Retrieval (ICMR), pp. 19\u201327 (2018)","DOI":"10.1145\/3206025.3206064"},{"key":"29_CR43","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"651","DOI":"10.1007\/978-3-319-46604-0_46","volume-title":"Computer Vision \u2013 ECCV 2016 Workshops","author":"M Otani","year":"2016","unstructured":"Otani, M., Nakashima, Y., Rahtu, E., Heikkil\u00e4, J., Yokoya, N.: Learning joint representations of videos and sentences with web image search. In: Hua, G., J\u00e9gou, H. (eds.) ECCV 2016. LNCS, vol. 9913, pp. 651\u2013667. Springer, Cham (2016). https:\/\/doi.org\/10.1007\/978-3-319-46604-0_46"},{"key":"29_CR44","doi-asserted-by":"crossref","unstructured":"Park, H., Noh, J., Ham, B.: Learning memory-guided normality for anomaly detection. In: IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 14372\u201314381 (2020)","DOI":"10.1109\/CVPR42600.2020.01438"},{"key":"29_CR45","unstructured":"Patrick, M., et al.: Support-set bottlenecks for video-text representation learning. In: International Conference on Learning Representations (ICLR) (2020)"},{"key":"29_CR46","doi-asserted-by":"crossref","unstructured":"Song, Y., Soleymani, M.: Polysemous visual-semantic embedding for cross-modal retrieval. In: IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 1979\u20131988 (2019)","DOI":"10.1109\/CVPR.2019.00208"},{"key":"29_CR47","unstructured":"Torabi, A., Tandon, N., Sigal, L.: Learning language-visual embedding for movie understanding with natural-language. arXiv preprint arXiv:1609.08124 (2016)"},{"key":"29_CR48","unstructured":"Vaswani, A., et al.: Attention is all you need. In: Advances in Neural Information Processing Systems (NeurIPS), pp. 5998\u20136008 (2017)"},{"key":"29_CR49","doi-asserted-by":"publisher","first-page":"2386","DOI":"10.1109\/TMM.2020.3011288","volume":"23","author":"W Wang","year":"2021","unstructured":"Wang, W., Gao, J., Yang, X., Xu, C.: Learning coarse-to-fine graph neural networks for video-text retrieval. IEEE Trans. Multimedia 23, 2386\u20132397 (2021)","journal-title":"IEEE Trans. Multimedia"},{"key":"29_CR50","doi-asserted-by":"crossref","unstructured":"Wang, X., Zhu, L., Yang, Y.: T2vlad: global-local sequence alignment for text-video retrieval. In: IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 5079\u20135088 (2021)","DOI":"10.1109\/CVPR46437.2021.00504"},{"key":"29_CR51","doi-asserted-by":"crossref","unstructured":"Wang, X., Wu, J., Chen, J., Li, L., Wang, Y.F., Wang, W.Y.: Vatex: A large-scale, high-quality multilingual dataset for video-and-language research. In: IEEE\/CVF International Conference on Computer Vision (ICCV), pp. 4581\u20134591 (2019)","DOI":"10.1109\/ICCV.2019.00468"},{"key":"29_CR52","doi-asserted-by":"crossref","unstructured":"Wei, J., Xu, X., Yang, Y., Ji, Y., Wang, Z., Shen, H.T.: Universal weighting metric learning for cross-modal matching. In: IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 13005\u201313014 (2020)","DOI":"10.1109\/CVPR42600.2020.01302"},{"key":"29_CR53","doi-asserted-by":"crossref","first-page":"6534","DOI":"10.1109\/TPAMI.2021.3088863","volume":"44","author":"J Wei","year":"2021","unstructured":"Wei, J., Yang, Y., Xu, X., Zhu, X., Shen, H.T.: Universal weighting metric learning for cross-modal retrieval. IEEE Trans. Pattern Anal. Mach. Intell. 44, 6534\u20136545 (2021)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"29_CR54","doi-asserted-by":"crossref","unstructured":"Wray, M., Larlus, D., Csurka, G., Damen, D.: Fine-grained action retrieval through multiple parts-of-speech embeddings. In: IEEE\/CVF International Conference on Computer Vision (ICCV), pp. 450\u2013459 (2019)","DOI":"10.1109\/ICCV.2019.00054"},{"key":"29_CR55","doi-asserted-by":"crossref","unstructured":"Xie, S., Girshick, R., Doll\u00e1r, P., Tu, Z., He, K.: Aggregated residual transformations for deep neural networks. In: IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 1492\u20131500 (2017)","DOI":"10.1109\/CVPR.2017.634"},{"key":"29_CR56","doi-asserted-by":"crossref","unstructured":"Xu, J., Mei, T., Yao, T., Rui, Y.: Msr-vtt: A large video description dataset for bridging video and language. In: IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 5288\u20135296 (2016)","DOI":"10.1109\/CVPR.2016.571"},{"key":"29_CR57","doi-asserted-by":"crossref","unstructured":"Xu, R., Xiong, C., Chen, W., Corso, J.: Jointly modeling deep video and compositional text to bridge vision and language in a unified framework. In: AAAI Conference on Artificial Intelligence (AAAI) (2015)","DOI":"10.1609\/aaai.v29i1.9512"},{"key":"29_CR58","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"153","DOI":"10.1007\/978-3-030-01240-3_10","volume-title":"Computer Vision \u2013 ECCV 2018","author":"T Yang","year":"2018","unstructured":"Yang, T., Chan, A.B.: Learning dynamic memory networks for object tracking. In: Ferrari, V., Hebert, M., Sminchisescu, C., Weiss, Y. (eds.) ECCV 2018. LNCS, vol. 11213, pp. 153\u2013169. Springer, Cham (2018). https:\/\/doi.org\/10.1007\/978-3-030-01240-3_10"},{"key":"29_CR59","doi-asserted-by":"crossref","unstructured":"Yang, X., Dong, J., Cao, Y., Wang, X., Wang, M., Chua, T.S.: Tree-augmented cross-modal encoding for complex-query video retrieval. In: International ACM SIGIR Conference on Research and Development in Information Retrieval (ACM SIGIR), pp. 1339\u20131348 (2020)","DOI":"10.1145\/3397271.3401151"},{"key":"29_CR60","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"487","DOI":"10.1007\/978-3-030-01234-2_29","volume-title":"Computer Vision \u2013 ECCV 2018","author":"Y Yu","year":"2018","unstructured":"Yu, Y., Kim, J., Kim, G.: A joint sequence fusion model for video question answering and retrieval. In: Ferrari, V., Hebert, M., Sminchisescu, C., Weiss, Y. (eds.) ECCV 2018. LNCS, vol. 11211, pp. 487\u2013503. Springer, Cham (2018). https:\/\/doi.org\/10.1007\/978-3-030-01234-2_29"},{"key":"29_CR61","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"385","DOI":"10.1007\/978-3-030-01261-8_23","volume-title":"Computer Vision \u2013 ECCV 2018","author":"B Zhang","year":"2018","unstructured":"Zhang, B., Hu, H., Sha, F.: Cross-modal and hierarchical modeling of video and text. In: Ferrari, V., Hebert, M., Sminchisescu, C., Weiss, Y. (eds.) ECCV 2018. LNCS, vol. 11217, pp. 385\u2013401. Springer, Cham (2018). https:\/\/doi.org\/10.1007\/978-3-030-01261-8_23"},{"key":"29_CR62","doi-asserted-by":"crossref","unstructured":"Zhu, L., Yang, Y.: Inflated episodic memory with region self-attention for long-tailed visual recognition. In: IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 4344\u20134353 (2020)","DOI":"10.1109\/CVPR42600.2020.00440"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2022"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-19781-9_29","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,3,12]],"date-time":"2024-03-12T16:39:28Z","timestamp":1710261568000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-19781-9_29"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2022]]},"ISBN":["9783031197802","9783031197819"],"references-count":62,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-19781-9_29","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2022]]},"assertion":[{"value":"23 October 2022","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Tel Aviv","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Israel","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2022","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"23 October 2022","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"27 October 2022","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"17","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2022","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2022.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Double-blind","order":1,"name":"type","label":"Type","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"CMT","order":2,"name":"conference_management_system","label":"Conference Management System","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"5804","order":3,"name":"number_of_submissions_sent_for_review","label":"Number of Submissions Sent for Review","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"1645","order":4,"name":"number_of_full_papers_accepted","label":"Number of Full Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"0","order":5,"name":"number_of_short_papers_accepted","label":"Number of Short Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"28% - The value is computed by the equation \"Number of Full Papers Accepted \/ Number of Submissions Sent for Review * 100\" and then rounded to a whole number.","order":6,"name":"acceptance_rate_of_full_papers","label":"Acceptance Rate of Full Papers","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"3.21","order":7,"name":"average_number_of_reviews_per_paper","label":"Average Number of Reviews per Paper","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"3.91","order":8,"name":"average_number_of_papers_per_reviewer","label":"Average Number of Papers per Reviewer","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"Yes","order":9,"name":"external_reviewers_involved","label":"External Reviewers Involved","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}}]}}