{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,28]],"date-time":"2026-01-28T20:24:44Z","timestamp":1769631884263,"version":"3.49.0"},"publisher-location":"Cham","reference-count":28,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031479687","type":"print"},{"value":"9783031479694","type":"electronic"}],"license":[{"start":{"date-parts":[[2023,1,1]],"date-time":"2023-01-01T00:00:00Z","timestamp":1672531200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2023,1,1]],"date-time":"2023-01-01T00:00:00Z","timestamp":1672531200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2023]]},"DOI":"10.1007\/978-3-031-47969-4_13","type":"book-chapter","created":{"date-parts":[[2023,11,30]],"date-time":"2023-11-30T20:02:06Z","timestamp":1701374526000},"page":"160-171","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":9,"title":["Zero-Shot Video Moment Retrieval Using BLIP-Based Models"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0009-0004-4337-3955","authenticated-orcid":false,"given":"Jobin Idiculla","family":"Wattasseril","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5683-2290","authenticated-orcid":false,"given":"Sumit","family":"Shekhar","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8981-8583","authenticated-orcid":false,"given":"J\u00fcrgen","family":"D\u00f6llner","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3861-5759","authenticated-orcid":false,"given":"Matthias","family":"Trapp","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2023,12,1]]},"reference":[{"key":"13_CR1","doi-asserted-by":"publisher","unstructured":"Carreira, J., Zisserman, A.: Quo vadis, action recognition? A new model and the kinetics dataset. In: CVPR 2017, pp. 4724\u20134733. IEEE Computer Society (2017). https:\/\/doi.org\/10.1109\/CVPR.2017.502","DOI":"10.1109\/CVPR.2017.502"},{"key":"13_CR2","doi-asserted-by":"crossref","unstructured":"Chen, J., Luo, W., Zhang, W., Ma, L.: Explore inter-contrast between videos via composition for weakly supervised temporal sentence grounding. In: AAAI 2022, pp. 267\u2013275. AAAI Press (2022). https:\/\/ojs.aaai.org\/index.php\/AAAI\/article\/view\/19902","DOI":"10.1609\/aaai.v36i1.19902"},{"key":"13_CR3","doi-asserted-by":"crossref","unstructured":"Choe, T.E., Lee, M.W., Guo, F., Taylor, G., Yu, L., Haering, N.: Semantic video event search for surveillance video. In: ICCV Workshops 2011, pp. 1963\u20131970. IEEE (2011)","DOI":"10.1109\/ICCVW.2011.6130489"},{"key":"13_CR4","doi-asserted-by":"publisher","unstructured":"Diwan, A., Peng, P., Mooney, R.J.: Zero-shot video moment retrieval with off-the-shelf models. CoRR abs\/2211.02178 (2022). https:\/\/doi.org\/10.48550\/arXiv.2211.02178","DOI":"10.48550\/arXiv.2211.02178"},{"key":"13_CR5","doi-asserted-by":"publisher","unstructured":"Du, Y., Liu, Z., Li, J., Zhao, W.X.: A survey of vision-language pre-trained models. In: IJCAI 2022, pp. 5436\u20135443. ijcai.org (2022). https:\/\/doi.org\/10.24963\/ijcai.2022\/762","DOI":"10.24963\/ijcai.2022\/762"},{"key":"13_CR6","doi-asserted-by":"publisher","unstructured":"Feichtenhofer, C., Fan, H., Malik, J., He, K.: Slowfast networks for video recognition. In: ICCV 2019, pp. 6201\u20136210. IEEE (2019). https:\/\/doi.org\/10.1109\/ICCV.2019.00630","DOI":"10.1109\/ICCV.2019.00630"},{"key":"13_CR7","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"130","DOI":"10.1007\/978-3-031-20059-5_8","volume-title":"Computer Vision \u2013 ECCV 2022","author":"J Hao","year":"2022","unstructured":"Hao, J., Sun, H., Ren, P., Wang, J., Qi, Q., Liao, J.: Can shuffling video benefit temporal bias problem: a novel training framework for temporal grounding. In: Avidan, S., Brostow, G., Ciss\u00e9, M., Farinella, G.M., Hassner, T. (eds.) ECCV 2022. LNCS, vol. 13696, pp. 130\u2013147. Springer, Cham (2022). https:\/\/doi.org\/10.1007\/978-3-031-20059-5_8"},{"key":"13_CR8","doi-asserted-by":"publisher","unstructured":"Hendricks, L.A., Wang, O., Shechtman, E., Sivic, J., Darrell, T., Russell, B.C.: Localizing moments in video with natural language. In: ICCV 2017, pp. 5804\u20135813. IEEE Computer Society (2017). https:\/\/doi.org\/10.1109\/ICCV.2017.618","DOI":"10.1109\/ICCV.2017.618"},{"key":"13_CR9","doi-asserted-by":"publisher","unstructured":"Huang, J., Worring, M.: Query-controllable video summarization. In: ICMR 2020, pp. 242\u2013250. ACM (2020). https:\/\/doi.org\/10.1145\/3372278.3390695","DOI":"10.1145\/3372278.3390695"},{"key":"13_CR10","doi-asserted-by":"publisher","unstructured":"Leake, M., Davis, A., Truong, A., Agrawala, M.: Computational video editing for dialogue-driven scenes. ACM Trans. Graph. 36(4), 130:1\u2013130:14 (2017). https:\/\/doi.org\/10.1145\/3072959.3073653","DOI":"10.1145\/3072959.3073653"},{"key":"13_CR11","unstructured":"Lei, J., Berg, T.L., Bansal, M.: QVHighlights: detecting moments and highlights in videos via natural language queries. CoRR abs\/2107.09609 (2021). arxiv.org\/abs\/2107.09609"},{"key":"13_CR12","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"447","DOI":"10.1007\/978-3-030-58589-1_27","volume-title":"Computer Vision \u2013 ECCV 2020","author":"J Lei","year":"2020","unstructured":"Lei, J., Yu, L., Berg, T.L., Bansal, M.: TVR: a large-scale dataset for video-subtitle moment retrieval. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12366, pp. 447\u2013463. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58589-1_27"},{"key":"13_CR13","doi-asserted-by":"publisher","unstructured":"Li, J., Li, D., Savarese, S., Hoi, S.C.H.: BLIP-2: bootstrapping language-image pre-training with frozen image encoders and large language models. CoRR abs\/2301.12597 (2023). https:\/\/doi.org\/10.48550\/arXiv.2301.12597","DOI":"10.48550\/arXiv.2301.12597"},{"key":"13_CR14","unstructured":"Li, J., Li, D., Xiong, C., Hoi, S.C.H.: BLIP: bootstrapping language-image pre-training for unified vision-language understanding and generation. In: ICML 2022, vol. 162, pp. 12888\u201312900. PMLR (2022). https:\/\/proceedings.mlr.press\/v162\/li22n.html"},{"key":"13_CR15","doi-asserted-by":"publisher","unstructured":"Liu, D., et al.: Context-aware biaffine localizing network for temporal sentence grounding. In: CVPR 2021, pp. 11235\u201311244. Computer Vision Foundation\/IEEE (2021). https:\/\/doi.org\/10.1109\/CVPR46437.2021.01108","DOI":"10.1109\/CVPR46437.2021.01108"},{"key":"13_CR16","doi-asserted-by":"publisher","unstructured":"Liu, Y., Li, S., Wu, Y., Chen, C.W., Shan, Y., Qie, X.: UMT: unified multi-modal transformers for joint video moment retrieval and highlight detection. In: CVPR 2022, pp. 3032\u20133041. IEEE (2022). https:\/\/doi.org\/10.1109\/CVPR52688.2022.00305","DOI":"10.1109\/CVPR52688.2022.00305"},{"key":"13_CR17","doi-asserted-by":"crossref","unstructured":"Moon, W., Hyun, S., Park, S., Park, D., Heo, J.P.: Query-dependent video representation for moment retrieval and highlight detection. In: CVPR 2023, pp. 23023\u201323033 (2023)","DOI":"10.1109\/CVPR52729.2023.02205"},{"key":"13_CR18","doi-asserted-by":"publisher","unstructured":"Nam, J., Ahn, D., Kang, D., Ha, S.J., Choi, J.: Zero-shot natural language video localization. In: ICCV 2021, pp. 1450\u20131459. IEEE (2021). https:\/\/doi.org\/10.1109\/ICCV48922.2021.00150","DOI":"10.1109\/ICCV48922.2021.00150"},{"key":"13_CR19","unstructured":"Song, Y., Wang, J., Ma, L., Yu, Z., Yu, J.: Weakly-supervised multi-level attentional reconstruction network for grounding textual queries in videos. CoRR abs\/2003.07048 (2020). arxiv.org\/abs\/2003.07048"},{"key":"13_CR20","doi-asserted-by":"publisher","unstructured":"Tang, K., Bao, Y., Zhao, Z., Zhu, L., Lin, Y., Peng, Y.: AutoHighlight: automatic highlights detection and segmentation in soccer matches. In: IEEE BigData 2018, pp. 4619\u20134624. IEEE (2018). https:\/\/doi.org\/10.1109\/BigData.2018.8621906","DOI":"10.1109\/BigData.2018.8621906"},{"key":"13_CR21","doi-asserted-by":"publisher","unstructured":"Tellex, S., Kollar, T., Shaw, G., Roy, N., Roy, D.: Grounding spatial language for video search. In: ICMI-MLMI 2010, pp. 31:1\u201331:8. ACM (2010). https:\/\/doi.org\/10.1145\/1891903.1891944","DOI":"10.1145\/1891903.1891944"},{"key":"13_CR22","doi-asserted-by":"publisher","unstructured":"Wang, G., Wu, X., Liu, Z., Yan, J.: Prompt-based zero-shot video moment retrieval. In: MM 2022, pp. 413\u2013421. ACM (2022). https:\/\/doi.org\/10.1145\/3503161.3548004","DOI":"10.1145\/3503161.3548004"},{"issue":"6","key":"13_CR23","doi-asserted-by":"publisher","first-page":"177","DOI":"10.1145\/3355089.3356520","volume":"38","author":"M Wang","year":"2019","unstructured":"Wang, M., Yang, G.W., Hu, S.M., Yau, S.T., Shamir, A., et al.: Write-a-video: computational video montage from themed text. ACM Trans. Graph. 38(6), 177\u20131 (2019)","journal-title":"ACM Trans. Graph."},{"key":"13_CR24","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"318","DOI":"10.1007\/978-3-030-01267-0_19","volume-title":"Computer Vision \u2013 ECCV 2018","author":"S Xie","year":"2018","unstructured":"Xie, S., Sun, C., Huang, J., Tu, Z., Murphy, K.: Rethinking spatiotemporal feature learning: speed-accuracy trade-offs in video classification. In: Ferrari, V., Hebert, M., Sminchisescu, C., Weiss, Y. (eds.) ECCV 2018. LNCS, vol. 11219, pp. 318\u2013335. Springer, Cham (2018). https:\/\/doi.org\/10.1007\/978-3-030-01267-0_19"},{"key":"13_CR25","doi-asserted-by":"publisher","unstructured":"Xu, M., et al.: Boundary-sensitive pre-training for temporal localization in videos. In: ICCV 2021, pp. 7200\u20137210. IEEE (2021). https:\/\/doi.org\/10.1109\/ICCV48922.2021.00713","DOI":"10.1109\/ICCV48922.2021.00713"},{"key":"13_CR26","doi-asserted-by":"publisher","unstructured":"Zeng, Y., Cao, D., Lu, S., Zhang, H., Xu, J., Qin, Z.: Moment is important: language-based video moment retrieval via adversarial learning. ACM Trans. Multim. Comput. Commun. Appl. 18(2), 56:1\u201356:21 (2022). https:\/\/doi.org\/10.1145\/3478025","DOI":"10.1145\/3478025"},{"key":"13_CR27","unstructured":"Zhang, H., Sun, A., Jing, W., Zhou, J.T.: The elements of temporal sentence grounding in videos: a survey and future directions. CoRR abs\/2201.08071 (2022). arxiv.org\/abs\/2201.08071"},{"key":"13_CR28","doi-asserted-by":"crossref","unstructured":"Zhang, S., Peng, H., Fu, J., Luo, J.: Learning 2D temporal adjacent networks for moment localization with natural language. In: AAAI 2020, pp. 12870\u201312877. AAAI Press (2020). https:\/\/ojs.aaai.org\/index.php\/AAAI\/article\/view\/6984","DOI":"10.1609\/aaai.v34i07.6984"}],"container-title":["Lecture Notes in Computer Science","Advances in Visual Computing"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-47969-4_13","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2023,11,30]],"date-time":"2023-11-30T20:03:52Z","timestamp":1701374632000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-47969-4_13"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023]]},"ISBN":["9783031479687","9783031479694"],"references-count":28,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-47969-4_13","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2023]]},"assertion":[{"value":"1 December 2023","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ISVC","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Symposium on Visual Computing","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Lake Tahoe, NV","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"USA","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2023","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"16 October 2023","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18 October 2023","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"isvc2023","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"http:\/\/www.isvc.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Double-blind","order":1,"name":"type","label":"Type","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"EasyChair","order":2,"name":"conference_management_system","label":"Conference Management System","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"25","order":3,"name":"number_of_submissions_sent_for_review","label":"Number of Submissions Sent for Review","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"58","order":4,"name":"number_of_full_papers_accepted","label":"Number of Full Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"0","order":5,"name":"number_of_short_papers_accepted","label":"Number of Short Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"232% - The value is computed by the equation \"Number of Full Papers Accepted \/ Number of Submissions Sent for Review * 100\" and then rounded to a whole number.","order":6,"name":"acceptance_rate_of_full_papers","label":"Acceptance Rate of Full Papers","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"3","order":7,"name":"average_number_of_reviews_per_paper","label":"Average Number of Reviews per Paper","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"2.3","order":8,"name":"average_number_of_papers_per_reviewer","label":"Average Number of Papers per Reviewer","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"No","order":9,"name":"external_reviewers_involved","label":"External Reviewers Involved","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"43 (oral), 15 (poster),  25 (special tracks) out of 34 submissions","order":10,"name":"additional_info_on_review_process","label":"Additional Info on Review Process","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}}]}}