{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,3,27]],"date-time":"2025-03-27T14:55:03Z","timestamp":1743087303670,"version":"3.40.3"},"publisher-location":"Cham","reference-count":34,"publisher":"Springer Nature Switzerland","isbn-type":[{"type":"print","value":"9783031189128"},{"type":"electronic","value":"9783031189135"}],"license":[{"start":{"date-parts":[[2022,1,1]],"date-time":"2022-01-01T00:00:00Z","timestamp":1640995200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2022,1,1]],"date-time":"2022-01-01T00:00:00Z","timestamp":1640995200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2022]]},"DOI":"10.1007\/978-3-031-18913-5_49","type":"book-chapter","created":{"date-parts":[[2022,10,26]],"date-time":"2022-10-26T23:03:53Z","timestamp":1666825433000},"page":"638-650","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":1,"title":["YFormer: A New Transformer Architecture for\u00a0Video-Query Based Video Moment Retrieval"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-7290-7838","authenticated-orcid":false,"given":"Shuwei","family":"Huo","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6072-337X","authenticated-orcid":false,"given":"Yuan","family":"Zhou","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4578-0904","authenticated-orcid":false,"given":"Haiyang","family":"Wang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2022,10,27]]},"reference":[{"key":"49_CR1","doi-asserted-by":"crossref","unstructured":"Bojanowski, P., et al.: Weakly-supervised alignment of video with text, pp. 4462\u20134470 (2015)","DOI":"10.1109\/ICCV.2015.507"},{"key":"49_CR2","doi-asserted-by":"crossref","unstructured":"Buch, S., Escorcia, V., Shen, C., Ghanem, B., Niebles, J.C.: SST: single-stream temporal action proposals. In: IEEE Conference on Computer Vision and Pattern Recognition (CVPR) (2017)","DOI":"10.1109\/CVPR.2017.675"},{"key":"49_CR3","doi-asserted-by":"publisher","unstructured":"Cao, D., Yu, Z., Zhang, H., Fang, J., Nie, L., Tian, Q.: Video-based cross-modal recipe retrieval. In: Proceedings of the 27th ACM International Conference on Multimedia, MM 2019, pp. 1685\u20131693. Association for Computing Machinery, New York, NY, USA (2019). https:\/\/doi.org\/10.1145\/3343031.3351067","DOI":"10.1145\/3343031.3351067"},{"key":"49_CR4","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"213","DOI":"10.1007\/978-3-030-58452-8_13","volume-title":"Computer Vision \u2013 ECCV 2020","author":"N Carion","year":"2020","unstructured":"Carion, N., Massa, F., Synnaeve, G., Usunier, N., Kirillov, A., Zagoruyko, S.: End-to-end object detection with transformers. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12346, pp. 213\u2013229. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58452-8_13"},{"key":"49_CR5","doi-asserted-by":"publisher","unstructured":"Carreira, J., Zisserman, A.: Quo Vadis, action recognition? a new model and the kinetics dataset. In: 2017 IEEE Conference on Computer Vision and Pattern Recognition (CVPR), pp. 4724\u20134733 (2017). https:\/\/doi.org\/10.1109\/CVPR.2017.502","DOI":"10.1109\/CVPR.2017.502"},{"issue":"7","key":"49_CR6","first-page":"10551","volume":"34","author":"L Chen","year":"2020","unstructured":"Chen, L., Lu, C., Tang, S., Xiao, J., Li, X.: Rethinking the bottom-up framework for query-based video localization. Proc. AAAI Conf. Artif. Intell. 34(7), 10551\u201310558 (2020)","journal-title":"Proc. AAAI Conf. Artif. Intell."},{"key":"49_CR7","unstructured":"Devlin, J., Chang, M.W., Lee, K., Toutanova, K.: BERT: pre-training of deep bidirectional transformers for language understanding. In: Conference of the North American Chapter of the Association for Computational Linguistics (2019)"},{"key":"49_CR8","unstructured":"Dosovitskiy, A., et al.: An image is worth 16\u00a0$$\\times $$\u00a016 words: transformers for image recognition at scale. arXiv preprint arXiv:2010.11929 (2020)"},{"key":"49_CR9","doi-asserted-by":"crossref","unstructured":"Heilbron, F.C., Victor Escorcia, B.G., Niebles, J.C.: ActivityNet: a large-scale video benchmark for human activity understanding. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 961\u2013970 (2015)","DOI":"10.1109\/CVPR.2015.7298698"},{"key":"49_CR10","doi-asserted-by":"crossref","unstructured":"Feng, Y., Ma, L., Liu, W., Zhang, T., Luo, J.: Video re-localization. In: European Conference on Computer Vision (2018)","DOI":"10.1007\/978-3-030-01264-9_4"},{"key":"49_CR11","doi-asserted-by":"crossref","unstructured":"Feng, Y., Ma, L., Liu, W., Luo, J.: Spatio-temporal video re-localization by warp LSTM. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 1288\u20131297 (2019)","DOI":"10.1109\/CVPR.2019.00138"},{"key":"49_CR12","doi-asserted-by":"crossref","unstructured":"Gao, J., Sun, C., Yang, Z., Nevatia, R.: TALL: temporal activity localization via language query, pp. 5277\u20135285 (2017)","DOI":"10.1109\/ICCV.2017.563"},{"key":"49_CR13","doi-asserted-by":"crossref","unstructured":"Habibian, A., Mensink, T., Snoek, C.: Composite concept discovery for zero-shot video event detection. In: Proceedings of International Conference on Multimedia Retrieval (2014)","DOI":"10.1145\/2578726.2578746"},{"key":"49_CR14","unstructured":"Hahn, M., Kadav, A., Rehg, J.M., Graf, H.: Tripping through time: efficient localization of activities in videos. arXiv abs\/1904.09936 (2020)"},{"key":"49_CR15","doi-asserted-by":"crossref","unstructured":"He, S., Luo, H., Wang, P., Wang, F., Li, H., Jiang, W.: TransReID: transformer-based object re-identification. arXiv preprint arXiv:2102.04378 (2021)","DOI":"10.1109\/ICCV48922.2021.01474"},{"key":"49_CR16","unstructured":"He, X., Pan, Y., Tang, M., Lv, Y.: Self-supervised video retrieval transformer network. arXiv abs\/2104.07993 (2021)"},{"key":"49_CR17","doi-asserted-by":"crossref","unstructured":"Heilbron, F.C., Escorcia, V., Ghanem, B., Niebles, J.C.: ActivityNet: a large-scale video benchmark for human activity understanding. In: IEEE Conference on Computer Vision and Pattern Recognition (CVPR) (2015)","DOI":"10.1109\/CVPR.2015.7298698"},{"key":"49_CR18","doi-asserted-by":"crossref","unstructured":"Hendricks, L.A., Wang, O., Shechtman, E., Sivic, J., Darrell, T., Russell, B.C.: Localizing moments in video with natural language, pp. 5804\u20135813 (2017)","DOI":"10.1109\/ICCV.2017.618"},{"key":"49_CR19","doi-asserted-by":"crossref","unstructured":"Huang, H., et al.: Transferable representation learning in vision-and-language navigation. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision (ICCV), October 2019","DOI":"10.1109\/ICCV.2019.00750"},{"issue":"10","key":"49_CR20","doi-asserted-by":"publisher","first-page":"2638","DOI":"10.1109\/TMM.2019.2905741","volume":"21","author":"G Kordopatis-Zilos","year":"2019","unstructured":"Kordopatis-Zilos, G., Papadopoulos, S., Patras, I., Kompatsiaris, I.: FIVR: fine-grained incident video retrieval. IEEE Trans. Multimedia 21(10), 2638\u20132652 (2019). https:\/\/doi.org\/10.1109\/TMM.2019.2905741","journal-title":"IEEE Trans. Multimedia"},{"key":"49_CR21","doi-asserted-by":"publisher","unstructured":"Liu, M., Wang, X., Nie, L., He, X., Chen, B., Chua, T.S.: Attentive moment retrieval in videos. In: The 41st International ACM SIGIR Conference on Research & Development in Information Retrieval, SIGIR 2018, pp. 15\u201324. Association for Computing Machinery, New York, NY, USA (2018). https:\/\/doi.org\/10.1145\/3209978.3210003","DOI":"10.1145\/3209978.3210003"},{"key":"49_CR22","doi-asserted-by":"crossref","unstructured":"Liu, M., Wang, X., Nie, L., Tian, Q., Chen, B., Chua, T.S.: Cross-modal moment localization in videos. In: Proceedings of the 26th ACM International Conference on Multimedia, MM 2018, pp. 843\u2013851. Association for Computing Machinery, New York, NY, USA (2018)","DOI":"10.1145\/3240508.3240549"},{"key":"49_CR23","unstructured":"Liu, Z., et al.: Video Swin transformer. CoRR abs\/2106.13230 (2021). https:\/\/arxiv.org\/abs\/2106.13230"},{"key":"49_CR24","doi-asserted-by":"crossref","unstructured":"Mithun, N.C., Paul, S., Roy-Chowdhury, A.K.: Weakly supervised video moment retrieval from text queries, pp. 11584\u201311593 (2019)","DOI":"10.1109\/CVPR.2019.01186"},{"issue":"7","key":"49_CR25","doi-asserted-by":"publisher","first-page":"3210","DOI":"10.1109\/TIP.2018.2814344","volume":"27","author":"J Song","year":"2018","unstructured":"Song, J., Zhang, H., Li, X., Gao, L., Wang, M., Hong, R.: Self-supervised video hashing with hierarchical binary auto-encoder. IEEE Trans. Image Process. 27(7), 3210\u20133221 (2018). https:\/\/doi.org\/10.1109\/TIP.2018.2814344","journal-title":"IEEE Trans. Image Process."},{"key":"49_CR26","doi-asserted-by":"publisher","first-page":"5589","DOI":"10.1109\/TIP.2021.3086591","volume":"30","author":"X Sun","year":"2021","unstructured":"Sun, X., Wang, H., He, B.: MABAN: multi-agent boundary-aware network for natural language moment retrieval. IEEE Trans. Image Process. 30, 5589\u20135599 (2021)","journal-title":"IEEE Trans. Image Process."},{"key":"49_CR27","doi-asserted-by":"crossref","unstructured":"Tellex, S., Roy, D.: Towards surveillance video search by natural language query. In: CIVR 2009 (2009)","DOI":"10.1145\/1646396.1646442"},{"key":"49_CR28","unstructured":"Vaswani, A., et al.: Attention is all you need. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR). Curran Associates Inc., Red Hook, NY, USA (2017)"},{"key":"49_CR29","unstructured":"Vaswani, A., et al.: Attention is all you need. arXiv abs\/1706.03762 (2017)"},{"key":"49_CR30","doi-asserted-by":"crossref","unstructured":"Wang, X., et al.: Reinforced cross-modal matching and self-supervised imitation learning for vision-language navigation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), June 2019","DOI":"10.1109\/CVPR.2019.00679"},{"key":"49_CR31","unstructured":"Wu, J.Y., Li, G., Liu, S., Lin, L.: Tree-structured policy based progressive reinforcement learning for temporally language grounding in video. arXiv abs\/2001.06680 (2020)"},{"key":"49_CR32","doi-asserted-by":"crossref","unstructured":"Yuan, Y., Mei, T., Zhu, W.: To find where you talk: temporal sentence localization in video with attention based location regression, July 2019","DOI":"10.1609\/aaai.v33i01.33019159"},{"key":"49_CR33","doi-asserted-by":"crossref","unstructured":"Zeng, Y., Cao, D., Wei, X., Liu, M., Zhao, Z., Qin, Z.: Multi-modal relational graph for cross-modal video moment retrieval. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 2215\u20132224, June 2021","DOI":"10.1109\/CVPR46437.2021.00225"},{"key":"49_CR34","doi-asserted-by":"crossref","unstructured":"Zhang, S., Peng, H., Fu, J., Luo, J.: Learning 2D temporal adjacent networks for moment localization with natural language. In: American Association for Artificial Intelligence (AAAI) (2020)","DOI":"10.1609\/aaai.v34i07.6984"}],"container-title":["Lecture Notes in Computer Science","Pattern Recognition and Computer Vision"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-18913-5_49","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,10,26]],"date-time":"2022-10-26T23:28:59Z","timestamp":1666826939000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-18913-5_49"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2022]]},"ISBN":["9783031189128","9783031189135"],"references-count":34,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-18913-5_49","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2022]]},"assertion":[{"value":"27 October 2022","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"PRCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Chinese Conference on Pattern Recognition and Computer Vision  (PRCV)","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Shenzhen","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"China","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2022","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"14 October 2022","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"17 October 2022","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"5","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"ccprcv2022","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"http:\/\/en.prcv.cn\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Double-blind","order":1,"name":"type","label":"Type","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"microsoft","order":2,"name":"conference_management_system","label":"Conference Management System","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"564","order":3,"name":"number_of_submissions_sent_for_review","label":"Number of Submissions Sent for Review","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"233","order":4,"name":"number_of_full_papers_accepted","label":"Number of Full Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"0","order":5,"name":"number_of_short_papers_accepted","label":"Number of Short Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"41% - The value is computed by the equation \"Number of Full Papers Accepted \/ Number of Submissions Sent for Review * 100\" and then rounded to a whole number.","order":6,"name":"acceptance_rate_of_full_papers","label":"Acceptance Rate of Full Papers","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"3.03","order":7,"name":"average_number_of_reviews_per_paper","label":"Average Number of Reviews per Paper","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"3.35","order":8,"name":"average_number_of_papers_per_reviewer","label":"Average Number of Papers per Reviewer","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"No","order":9,"name":"external_reviewers_involved","label":"External Reviewers Involved","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}}]}}