{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,1]],"date-time":"2026-06-01T20:11:17Z","timestamp":1780344677579,"version":"3.54.1"},"publisher-location":"Cham","reference-count":36,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031250682","type":"print"},{"value":"9783031250699","type":"electronic"}],"license":[{"start":{"date-parts":[[2023,1,1]],"date-time":"2023-01-01T00:00:00Z","timestamp":1672531200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2023,1,1]],"date-time":"2023-01-01T00:00:00Z","timestamp":1672531200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2023]]},"DOI":"10.1007\/978-3-031-25069-9_40","type":"book-chapter","created":{"date-parts":[[2023,2,14]],"date-time":"2023-02-14T00:15:46Z","timestamp":1676333746000},"page":"627-643","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":10,"title":["Are All Combinations Equal? Combining Textual and\u00a0Visual Features with\u00a0Multiple Space Learning for\u00a0Text-Based Video Retrieval"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-1852-4545","authenticated-orcid":false,"given":"Damianos","family":"Galanopoulos","sequence":"first","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0121-4364","authenticated-orcid":false,"given":"Vasileios","family":"Mezaris","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"297","published-online":{"date-parts":[[2023,2,14]]},"reference":[{"key":"40_CR1","unstructured":"Awad, G., et al.: Evaluating multiple video understanding and retrieval tasks at trecvid 2021. In: Proceedings of TRECVID 2021. NIST, USA (2021)"},{"key":"40_CR2","doi-asserted-by":"crossref","unstructured":"Bain, M., Nagrani, A., Varol, G., Zisserman, A.: Frozen in time: a joint video and image encoder for end-to-end retrieval. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 1728\u20131738 (2021)","DOI":"10.1109\/ICCV48922.2021.00175"},{"key":"40_CR3","doi-asserted-by":"crossref","unstructured":"Caba Heilbron, F., Escorcia, V., B., G., Niebles, J.C.: Activitynet: a large-scale video benchmark for human activity understanding. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 961\u2013970 (2015)","DOI":"10.1109\/CVPR.2015.7298698"},{"key":"40_CR4","doi-asserted-by":"crossref","unstructured":"Chen, A., Hu, F., Wang, Z., Zhou, F., Li, X.: What matters for ad-hoc video search? a large-scale evaluation on trecvid. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 2317\u20132322 (2021)","DOI":"10.1109\/ICCVW54120.2021.00262"},{"key":"40_CR5","doi-asserted-by":"crossref","unstructured":"Chen, S., Zhao, Y., Jin, Q., Wu, Q.: Fine-grained video-text retrieval with hierarchical graph reasoning. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR), pp. 10638\u201310647 (2020)","DOI":"10.1109\/CVPR42600.2020.01065"},{"key":"40_CR6","unstructured":"Cheng, X., Lin, H., Wu, X., Yang, F., Shen, D.: Improving video-text retrieval by multi-stream corpus alignment and dual softmax loss. arXiv preprint arXiv:2109.04290 (2021)"},{"key":"40_CR7","unstructured":"Devlin, J., Chang, M.W., Lee, K., Toutanova, K.: BERT: pre-training of deep bidirectional transformers for language understanding. In: arXiv preprint arXiv:1810.04805 (2018)"},{"issue":"12","key":"40_CR8","doi-asserted-by":"publisher","first-page":"3377","DOI":"10.1109\/TMM.2018.2832602","volume":"20","author":"J Dong","year":"2018","unstructured":"Dong, J., Li, X., Snoek, C.G.M.: Predicting visual features from text for image and video caption retrieval. IEEE Trans. Multimedia 20(12), 3377\u20133388 (2018)","journal-title":"IEEE Trans. Multimedia"},{"key":"40_CR9","doi-asserted-by":"crossref","unstructured":"Dong, J., et al.: Dual encoding for zero-example video retrieval. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 9346\u20139355 (2019)","DOI":"10.1109\/CVPR.2019.00957"},{"issue":"8","key":"40_CR10","first-page":"4065","volume":"44","author":"J Dong","year":"2022","unstructured":"Dong, J., Li, X., Xu, C., Yang, X., Yang, G., Wang, X., Wang, M.: Dual encoding for video retrieval by text. IEEE Trans. Pattern Anal. Mach. Intell. 44(8), 4065\u20134080 (2022)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"40_CR11","doi-asserted-by":"crossref","unstructured":"Dong, J., Wang, Y., Chen, X., Qu, X., Li, X., He, Y., Wang, X.: Reading-strategy inspired visual representation learning for text-to-video retrieval. IEEE Trans. Circuits Syst. Video Technol. (2022)","DOI":"10.1109\/TCSVT.2022.3150959"},{"key":"40_CR12","unstructured":"Faghri, F., Fleet, D.J., Kiros, J.R., Fidler, S.: Vse++: improving visual-semantic embeddings with hard negatives. In: Proceedings of the British Machine Vision Conference (BMVC) (2018)"},{"key":"40_CR13","doi-asserted-by":"crossref","unstructured":"Galanopoulos, D., Mezaris, V.: Hard-negatives or Non-negatives? a hard-negative selection strategy for cross-modal retrieval using the improved marginal ranking loss. In: 2021 IEEE\/CVF ICCVW (2021)","DOI":"10.1109\/ICCVW54120.2021.00261"},{"key":"40_CR14","doi-asserted-by":"crossref","unstructured":"Galanopoulos, D., Mezaris, V.: Attention mechanisms, signal encodings and fusion strategies for improved ad-hoc video search with dual encoding networks. In: Proceedings of the ACM International Conference on Multimedia Retrieval (ICMR 2020). ACM (2020)","DOI":"10.1145\/3372278.3390737"},{"key":"40_CR15","doi-asserted-by":"crossref","unstructured":"Ge, Y., Ge, Y., Liu, X., Li, D., Shan, Y., Qie, X., Luo, P.: Bridging video-text retrieval with multiple choice questions. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 16167\u201316176 (2022)","DOI":"10.1109\/CVPR52688.2022.01569"},{"issue":"10","key":"40_CR16","doi-asserted-by":"publisher","first-page":"2089","DOI":"10.1109\/TPAMI.2016.2627563","volume":"39","author":"A Habibian","year":"2017","unstructured":"Habibian, A., Mensink, T., Snoek, C.G.M.: Video2vec embeddings recognize events when examples are scarce. IEEE Trans. Pattern Anal. Mach. Intell. 39(10), 2089\u20132103 (2017)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"40_CR17","doi-asserted-by":"crossref","unstructured":"Habibian, A., Mensink, T., Snoek, C.G.: Videostory: a new multimedia embedding for few-example recognition and translation of events. In: Proceedings of the 22nd ACM International Conference on Multimedia, MM 2014, pp. 17\u201326. ACM, New York (2014)","DOI":"10.1145\/2647868.2654913"},{"key":"40_CR18","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., Sun, J.: Deep residual learning for image recognition. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 770\u2013778 (2016)","DOI":"10.1109\/CVPR.2016.90"},{"key":"40_CR19","doi-asserted-by":"crossref","unstructured":"Li, X., Xu, C., Yang, G., Chen, Z., Dong, J.: W2VV++ fully deep learning for ad-hoc video search. In: Proceedings of the 27th ACM International Conference on Multimedia, pp. 1786\u20131794 (2019)","DOI":"10.1145\/3343031.3350906"},{"key":"40_CR20","doi-asserted-by":"publisher","first-page":"4351","DOI":"10.1109\/TMM.2020.3042067","volume":"23","author":"X Li","year":"2021","unstructured":"Li, X., Zhou, F., Xu, C., Ji, J., Yang, G.: SEA: sentence encoder assembly for video retrieval by textual queries. IEEE Trans. Multimed. 23, 4351\u20134362 (2021)","journal-title":"IEEE Trans. Multimed."},{"key":"40_CR21","doi-asserted-by":"crossref","unstructured":"Li, Y., Song, Y., et al.: TGIF: a new dataset and benchmark on animated gif description. In: Proceedings of IEEE CVPR, pp. 4641\u20134650 (2016)","DOI":"10.1109\/CVPR.2016.502"},{"key":"40_CR22","doi-asserted-by":"crossref","unstructured":"Liu, S., Fan, H., Qian, S., Chen, Y., Ding, W., Wang, Z.: Hit: Hierarchical transformer with momentum contrast for video-text retrieval. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision. pp. 11915\u201311925 (2021)","DOI":"10.1109\/ICCV48922.2021.01170"},{"key":"40_CR23","unstructured":"Liu, Y., Albanie, S., Nagrani, A., Zisserman, A.: Use what you have: Video retrieval using representations from collaborative experts. arXiv preprint arXiv:1907.13487 (2019)"},{"key":"40_CR24","doi-asserted-by":"crossref","unstructured":"Lu, Y.J., Zhang, H., de Boer, M., Ngo, C.W.: Event detection with zero example: Select the right and suppress the wrong concepts. In: Proceedings of the 2016 ACM on International Conference on Multimedia Retrieval. p. 127\u2013134. ICMR \u201916, ACM, New York, NY, USA (2016)","DOI":"10.1145\/2911996.2912015"},{"key":"40_CR25","doi-asserted-by":"crossref","unstructured":"Mahajan, D., Girshick, R., Ramanathan, V., He, K., Paluri, M., Li, Y., Bharambe, A., Van Der Maaten, L.: Exploring the limits of weakly supervised pretraining. In: Proceedings of the European conference on computer vision (ECCV). pp. 181\u2013196 (2018)","DOI":"10.1007\/978-3-030-01216-8_12"},{"key":"40_CR26","doi-asserted-by":"crossref","unstructured":"Markatopoulou, F., Galanopoulos, D., Mezaris, V., Patras, I.: Query and keyframe representations for ad-hoc video search. In: Proceedings of the 2017 ACM International Conference on Multimedia Retrieval. pp. 407\u2013411. ICMR \u201917, ACM (2017)","DOI":"10.1145\/3078971.3079041"},{"key":"40_CR27","doi-asserted-by":"crossref","unstructured":"Miech, A., Zhukov, D., Alayrac, J.B., Tapaswi, M., Laptev, I., Sivic, J.: Howto100m: Learning a text-video embedding by watching hundred million narrated video clips. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision. pp. 2630\u20132640 (2019)","DOI":"10.1109\/ICCV.2019.00272"},{"key":"40_CR28","unstructured":"Mikolov, T., Chen, K., Corrado, G., Dean, J.: Efficient estimation of word representations in vector space. In: 1st International Conference on Learning Representations, Workshop Track Proceedings. ICLR \u201913 (2013)"},{"key":"40_CR29","doi-asserted-by":"crossref","unstructured":"Portillo-Quintero, J.A., Ortiz-Bayliss, J.C., Terashima-Mar\u00edn, H.: A straightforward framework for video retrieval using clip. In: Mexican Conference on Pattern Recognition. pp. 3\u201312. Springer (2021)","DOI":"10.1007\/978-3-030-77004-4_1"},{"key":"40_CR30","unstructured":"Radford, A., Kim, J.W., Hallacy, C., Ramesh, A., Goh, G., Agarwal, S., Sastry, G., Askell, A., Mishkin, P., Clark, J., Krueger, G., Sutskever, I.: Learning transferable visual models from natural language supervision. In: Proc. of the 38th Int. Conf. on Machine Learning (ICML) (2021)"},{"key":"40_CR31","doi-asserted-by":"crossref","unstructured":"Song, Y., Soleymani, M.: Polysemous visual-semantic embedding for cross-modal retrieval. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR). pp. 1979\u20131988 (2019)","DOI":"10.1109\/CVPR.2019.00208"},{"key":"40_CR32","doi-asserted-by":"crossref","unstructured":"Wang, X., Wu, J., et al.: Vatex: A large-scale, high-quality multilingual dataset for video-and-language research. In: Proc. of the IEEE Int. Conf. on Computer Vision. pp. 4581\u20134591 (2019)","DOI":"10.1109\/ICCV.2019.00468"},{"key":"40_CR33","doi-asserted-by":"crossref","unstructured":"Wu, J., Ngo, C.W.: Interpretable embedding for ad-hoc video search. In: Proceedings of the 28th ACM International Conference on Multimedia. p. 3357\u20133366. ACM, New York, NY, USA (2020)","DOI":"10.1145\/3394171.3413916"},{"key":"40_CR34","doi-asserted-by":"crossref","unstructured":"Xu, J., Mei, T., Yao, T., Rui, Y.: MSR-VTT: A large video description dataset for bridging video and language. In: Proc. of IEEE CVPR. pp. 5288\u20135296 (2016)","DOI":"10.1109\/CVPR.2016.571"},{"key":"40_CR35","doi-asserted-by":"crossref","unstructured":"Yang, X., Dong, J., Cao, Y., Wang, X., Wang, M., Chua, T.S.: Tree-augmented cross-modal encoding for complex-query video retrieval. In: Proceedings of the 43rd international ACM SIGIR conference on research and development in information retrieval. pp. 1339\u20131348 (2020)","DOI":"10.1145\/3397271.3401151"},{"key":"40_CR36","doi-asserted-by":"crossref","unstructured":"Yu, Y., Kim, J., Kim, G.: A joint sequence fusion model for video question answering and retrieval. In: Proceedings of the European Conference on Computer Vision (ECCV). pp. 471\u2013487 (2018)","DOI":"10.1007\/978-3-030-01234-2_29"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2022 Workshops"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-25069-9_40","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,3,7]],"date-time":"2024-03-07T12:58:00Z","timestamp":1709816280000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-25069-9_40"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023]]},"ISBN":["9783031250682","9783031250699"],"references-count":36,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-25069-9_40","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2023]]},"assertion":[{"value":"14 February 2023","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Tel Aviv","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Israel","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2022","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"23 October 2022","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"27 October 2022","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"17","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2022","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2022.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Double-blind","order":1,"name":"type","label":"Type","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"CMT","order":2,"name":"conference_management_system","label":"Conference Management System","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"5804","order":3,"name":"number_of_submissions_sent_for_review","label":"Number of Submissions Sent for Review","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"1645","order":4,"name":"number_of_full_papers_accepted","label":"Number of Full Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"0","order":5,"name":"number_of_short_papers_accepted","label":"Number of Short Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"28% - The value is computed by the equation \"Number of Full Papers Accepted \/ Number of Submissions Sent for Review * 100\" and then rounded to a whole number.","order":6,"name":"acceptance_rate_of_full_papers","label":"Acceptance Rate of Full Papers","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"3.21","order":7,"name":"average_number_of_reviews_per_paper","label":"Average Number of Reviews per Paper","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"3.91","order":8,"name":"average_number_of_papers_per_reviewer","label":"Average Number of Papers per Reviewer","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"Yes","order":9,"name":"external_reviewers_involved","label":"External Reviewers Involved","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"From the workshops, 367 reviewed full papers have been selected for publication","order":10,"name":"additional_info_on_review_process","label":"Additional Info on Review Process","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}}]}}