{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,3,27]],"date-time":"2025-03-27T06:34:14Z","timestamp":1743057254943,"version":"3.40.3"},"publisher-location":"Singapore","reference-count":33,"publisher":"Springer Nature Singapore","isbn-type":[{"type":"print","value":"9789819786190"},{"type":"electronic","value":"9789819786206"}],"license":[{"start":{"date-parts":[[2024,10,20]],"date-time":"2024-10-20T00:00:00Z","timestamp":1729382400000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,10,20]],"date-time":"2024-10-20T00:00:00Z","timestamp":1729382400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-981-97-8620-6_22","type":"book-chapter","created":{"date-parts":[[2024,10,19]],"date-time":"2024-10-19T21:02:10Z","timestamp":1729371730000},"page":"318-332","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Uncertainty-Aware with\u00a0Negative Samples for\u00a0Video-Text Retrieval"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0009-0008-9804-4277","authenticated-orcid":false,"given":"Weitao","family":"Song","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0009-0008-1559-2498","authenticated-orcid":false,"given":"Weiran","family":"Chen","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0009-0008-0435-1779","authenticated-orcid":false,"given":"Jialiang","family":"Xu","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6965-4158","authenticated-orcid":false,"given":"Yi","family":"Ji","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0009-0004-1669-1878","authenticated-orcid":false,"given":"Ying","family":"Li","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0009-0008-1495-5138","authenticated-orcid":false,"given":"Chunping","family":"Liu","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,10,20]]},"reference":[{"key":"22_CR1","doi-asserted-by":"crossref","unstructured":"Anne\u00a0Hendricks, L., Wang, O., Shechtman, E., Sivic, J., Darrell, T., Russell, B.: Localizing moments in video with natural language. In: Proceedings of the IEEE International Conference on Computer Vision. pp. 5803\u20135812 (2017)","DOI":"10.1109\/ICCV.2017.618"},{"key":"22_CR2","doi-asserted-by":"crossref","unstructured":"Bogolin, S.V., Croitoru, I., Jin, H., Liu, Y., Albanie, S.: Cross modal retrieval with querybank normalisation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 5194\u20135205 (2022)","DOI":"10.1109\/CVPR52688.2022.00513"},{"key":"22_CR3","unstructured":"Chen, D., Dolan, W.B.: Collecting highly parallel data for paraphrase evaluation. In: Proceedings of the 49th Annual Meeting of the Association for Computational Linguistics: Human Language Technologies, pp. 190\u2013200 (2011)"},{"key":"22_CR4","unstructured":"Cheng, X., Lin, H., Wu, X., Yang, F., Shen, D.: Improving video-text retrieval by multi-stream corpus alignment and dual softmax loss. arXiv preprint arXiv:2109.04290 (2021)"},{"key":"22_CR5","doi-asserted-by":"crossref","unstructured":"Fang, B., Wu, W., Liu, C., Zhou, Y., Song, Y., Wang, W., Shu, X., Ji, X., Wang, J.: Uatvr: uncertainty-adaptive text-video retrieval. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 13723\u201313733 (2023)","DOI":"10.1109\/ICCV51070.2023.01262"},{"key":"22_CR6","unstructured":"Fang, H., Xiong, P., Xu, L., Chen, Y.: Clip2video: mastering video-text retrieval via image clip. arXiv preprint arXiv:2106.11097 (2021)"},{"key":"22_CR7","doi-asserted-by":"crossref","unstructured":"Ge, Y., Ge, Y., Liu, X., Li, D., Shan, Y., Qie, X., Luo, P.: Bridging video-text retrieval with multiple choice questions. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 16167\u201316176 (2022)","DOI":"10.1109\/CVPR52688.2022.01569"},{"key":"22_CR8","doi-asserted-by":"crossref","unstructured":"Gorti, S.K., Vouitsis, N., Ma, J., Golestan, K., Volkovs, M., Garg, A., Yu, G.: X-pool: cross-modal language-video attention for text-video retrieval. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 5006\u20135015 (2022)","DOI":"10.1109\/CVPR52688.2022.00495"},{"key":"22_CR9","unstructured":"Goyal, P., Doll\u00e1r, P., Girshick, R., Noordhuis, P., Wesolowski, L., Kyrola, A., Tulloch, A., Jia, Y., He, K.: Accurate, large minibatch sgd: training imagenet in 1 hour. arXiv preprint arXiv:1706.02677 (2017)"},{"key":"22_CR10","doi-asserted-by":"crossref","unstructured":"Hadsell, R., Chopra, S., LeCun, Y.: Dimensionality reduction by learning an invariant mapping. In: 2006 IEEE Computer Society Conference on Computer Vision and Pattern Recognition (CVPR\u201906). vol.\u00a02, pp. 1735\u20131742. IEEE (2006)","DOI":"10.1109\/CVPR.2006.100"},{"key":"22_CR11","unstructured":"Jiang, J., Min, S., Kong, W., Wang, H., Li, Z., Liu, W.: Tencent text-video retrieval: hierarchical cross-modal interactions with multi-level representations. IEEE Access (2022)"},{"key":"22_CR12","doi-asserted-by":"crossref","unstructured":", Jin, P., Huang, J., Xiong, P., Tian, S., Liu, C., Ji, X., Yuan, L., Chen, J.: Video-text as game players: hierarchical banzhaf interaction for cross-modal representation learning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 2472\u20132482 (2023)","DOI":"10.1109\/CVPR52729.2023.00244"},{"key":"22_CR13","unstructured":"Kingma, D.P., Ba, J.: Adam: A method for stochastic optimization. arXiv preprint arXiv:1412.6980 (2014)"},{"key":"22_CR14","doi-asserted-by":"crossref","unstructured":"Liu, A.H., Jin, S., Lai, C.I.J., Rouditchenko, A., Oliva, A., Glass, J.: Cross-modal discrete representation learning. arXiv preprint arXiv:2106.05438 (2021)","DOI":"10.18653\/v1\/2022.acl-long.215"},{"key":"22_CR15","doi-asserted-by":"crossref","unstructured":"Liu, R., Huang, J., Li, G., Feng, J., Wu, X., Li, T.H.: Revisiting temporal modeling for clip-based image-to-video knowledge transferring. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 6555\u20136564 (2023)","DOI":"10.1109\/CVPR52729.2023.00634"},{"key":"22_CR16","doi-asserted-by":"crossref","unstructured":"Liu, Y., Xiong, P., Xu, L., Cao, S., Jin, Q.: Ts2-net: token shift and selection transformer for text-video retrieval. In: European conference on computer vision, pp. 319\u2013335. Springer (2022)","DOI":"10.1007\/978-3-031-19781-9_19"},{"key":"22_CR17","unstructured":"Loshchilov, I., Hutter, F.S.: Stochastic gradient descent with warm restarts (2016)"},{"key":"22_CR18","doi-asserted-by":"publisher","first-page":"293","DOI":"10.1016\/j.neucom.2022.07.028","volume":"508","author":"H Luo","year":"2022","unstructured":"Luo, H., Ji, L., Zhong, M., Chen, Y., Lei, W., Duan, N., Li, T.: Clip4clip: an empirical study of clip for end to end video clip retrieval and captioning. Neurocomputing 508, 293\u2013304 (2022)","journal-title":"Neurocomputing"},{"key":"22_CR19","doi-asserted-by":"crossref","unstructured":"Ma, Y., Xu, G., Sun, X., Yan, M., Zhang, J., Ji, R.: X-clip: end-to-end multi-grained contrastive learning for video-text retrieval. In: Proceedings of the 30th ACM International Conference on Multimedia, pp. 638\u2013647 (2022)","DOI":"10.1145\/3503161.3547910"},{"key":"22_CR20","unstructured":"Oord, A.v.d., Li, Y., Vinyals, O.: Representation learning with contrastive predictive coding. arXiv preprint arXiv:1807.03748 (2018)"},{"key":"22_CR21","unstructured":"Radford, A., Kim, J.W., Hallacy, C., Ramesh, A., Goh, G., Agarwal, S., Sastry, G., Askell, A., Mishkin, P., Clark, J., et\u00a0al.: Learning transferable visual models from natural language supervision. In: International Conference on Machine Learning, pp. 8748\u20138763. PMLR (2021)"},{"key":"22_CR22","doi-asserted-by":"crossref","unstructured":"Rasheed, H., Khattak, M.U., Maaz, M., Khan, S., Khan, F.S.: Fine-tuned clip models are efficient video learners. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 6545\u20136554 (2023)","DOI":"10.1109\/CVPR52729.2023.00633"},{"key":"22_CR23","doi-asserted-by":"crossref","unstructured":"Schroff, F., Kalenichenko, D., Philbin, J.: Facenet: A unified embedding for face recognition and clustering. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 815\u2013823 (2015)","DOI":"10.1109\/CVPR.2015.7298682"},{"key":"22_CR24","doi-asserted-by":"crossref","unstructured":"Song, X., Chen, J., Jiang, Y.G.: Relation triplet construction for cross-modal text-to-video retrieval. In: Proceedings of the 31st ACM International Conference on Multimedia, pp. 4759\u20134767 (2023)","DOI":"10.1145\/3581783.3611940"},{"key":"22_CR25","unstructured":"Torabi, A., Tandon, N., Sigal, L.: Learning language-visual embedding for movie understanding with natural-language. arXiv preprint arXiv:1609.08124 (2016)"},{"key":"22_CR26","unstructured":"Wang, X., Chen, H., Tang, S., Wu, Z., Zhu, W.: Disentangled representation learning. arXiv preprint arXiv:2211.11695 (2022)"},{"key":"22_CR27","doi-asserted-by":"crossref","unstructured":"Wang, X., Wu, J., Chen, J., Li, L., Wang, Y.F., Wang, W.Y.: Vatex: a large-scale, high-quality multilingual dataset for video-and-language research. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 4581\u20134591 (2019)","DOI":"10.1109\/ICCV.2019.00468"},{"key":"22_CR28","doi-asserted-by":"crossref","unstructured":"Wang, Z., Sung, Y.L., Cheng, F., Bertasius, G., Bansal, M.: Unified coarse-to-fine alignment for video-text retrieval. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 2816\u20132827 (2023)","DOI":"10.1109\/ICCV51070.2023.00264"},{"key":"22_CR29","doi-asserted-by":"crossref","unstructured":"Wray, M., Doughty, H., Damen, D.: On semantic similarity in video retrieval. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 3650\u20133660 (2021)","DOI":"10.1109\/CVPR46437.2021.00365"},{"key":"22_CR30","doi-asserted-by":"crossref","unstructured":"Wu, W., Luo, H., Fang, B., Wang, J., Ouyang, W.: Cap4video: What can auxiliary captions do for text-video retrieval? In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 10704\u201310713 (2023)","DOI":"10.1109\/CVPR52729.2023.01031"},{"key":"22_CR31","doi-asserted-by":"crossref","unstructured":"Xu, J., Mei, T., Yao, T., Rui, Y.: Msr-vtt: a large video description dataset for bridging video and language. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 5288\u20135296 (2016)","DOI":"10.1109\/CVPR.2016.571"},{"key":"22_CR32","doi-asserted-by":"crossref","unstructured":"Zhao, S., Zhu, L., Wang, X., Yang, Y.: Centerclip: Token clustering for efficient text-video retrieval. In: Proceedings of the 45th International ACM SIGIR Conference on Research and Development in Information Retrieval, pp. 970\u2013981 (2022)","DOI":"10.1145\/3477495.3531950"},{"key":"22_CR33","doi-asserted-by":"crossref","unstructured":"Zolfaghari, M., Zhu, Y., Gehler, P., Brox, T.: Crossclr: Cross-modal contrastive learning for multi-modal video representations. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 1450\u20131459 (2021)","DOI":"10.1109\/ICCV48922.2021.00148"}],"container-title":["Lecture Notes in Computer Science","Pattern Recognition and Computer Vision"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-981-97-8620-6_22","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,1,14]],"date-time":"2025-01-14T20:16:55Z","timestamp":1736885815000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-981-97-8620-6_22"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,20]]},"ISBN":["9789819786190","9789819786206"],"references-count":33,"URL":"https:\/\/doi.org\/10.1007\/978-981-97-8620-6_22","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2024,10,20]]},"assertion":[{"value":"20 October 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"PRCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Chinese Conference on Pattern Recognition and Computer Vision  (PRCV)","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Urumqi","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"China","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18 October 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"20 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"7","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"ccprcv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"http:\/\/2024.prcv.cn\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}