{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,15]],"date-time":"2026-04-15T17:59:27Z","timestamp":1776275967706,"version":"3.50.1"},"publisher-location":"Cham","reference-count":38,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031730092","type":"print"},{"value":"9783031730108","type":"electronic"}],"license":[{"start":{"date-parts":[[2024,11,10]],"date-time":"2024-11-10T00:00:00Z","timestamp":1731196800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,11,10]],"date-time":"2024-11-10T00:00:00Z","timestamp":1731196800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-73010-8_14","type":"book-chapter","created":{"date-parts":[[2024,11,9]],"date-time":"2024-11-09T13:10:10Z","timestamp":1731157810000},"page":"227-243","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":2,"title":["E3M: Zero-Shot Spatio-Temporal Video Grounding with\u00a0Expectation-Maximization Multimodal Modulation"],"prefix":"10.1007","author":[{"given":"Peijun","family":"Bao","sequence":"first","affiliation":[]},{"given":"Zihao","family":"Shao","sequence":"additional","affiliation":[]},{"given":"Wenhan","family":"Yang","sequence":"additional","affiliation":[]},{"given":"Boon Poh","family":"Ng","sequence":"additional","affiliation":[]},{"given":"Alex C.","family":"Kot","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,11,10]]},"reference":[{"key":"14_CR1","doi-asserted-by":"crossref","unstructured":"Ahn, J., Kwak, S.: Learning pixel-level semantic affinity with image-level supervision for weakly supervised semantic segmentation. In: CVPR (2018)","DOI":"10.1109\/CVPR.2018.00523"},{"key":"14_CR2","doi-asserted-by":"crossref","unstructured":"Antoine\u00a0Yang, Antoine\u00a0Miech, J.S.I.L., Schmid, C.: Tubedetr: Spatio-temporal video grounding with transformers. In: CVPR (2022)","DOI":"10.1109\/CVPR52688.2022.01595"},{"key":"14_CR3","doi-asserted-by":"crossref","unstructured":"Bao, P., Shao, Z., Yang, W., Ng, B.P., Er, M.H., Kot, A.C.: Omnipotent distillation with LLMS for weakly-supervised natural language video localization: When divergence meets consistency. In: AAAI (2024)","DOI":"10.1609\/aaai.v38i2.27832"},{"key":"14_CR4","doi-asserted-by":"crossref","unstructured":"Bao, P., Xia, Y., Yang, W., Ng, B.P., Er, M.H., Kot, A.C.: Local-global multi-modal distillation for weakly-supervised temporal video grounding. In: AAAI (2024)","DOI":"10.1609\/aaai.v38i2.27831"},{"key":"14_CR5","doi-asserted-by":"crossref","unstructured":"Bao, P., Yang, W., Ng, B.P., Er, M.H., Kot, A.C.: Cross-modal label contrastive learning for unsupervised audio-visual event localization. In: AAAI (2023)","DOI":"10.1609\/aaai.v37i1.25093"},{"key":"14_CR6","doi-asserted-by":"crossref","unstructured":"Chen, J., Bao, W., Kong, Y.: Activity-driven weakly-supervised spatio-temporal grounding from untrimmed videos. In: ACM MM (2020)","DOI":"10.1145\/3394171.3413614"},{"key":"14_CR7","doi-asserted-by":"crossref","unstructured":"Jiang, K., He, X., Xu, R., Wang, X.E.: Comclip: training-free compositional image and text matching. In: NAACL (2024)","DOI":"10.18653\/v1\/2024.naacl-long.370"},{"key":"14_CR8","unstructured":"Jin, Y., Li, Y., Yuan, Z., Mu, Y.: Embracing consistency: a one-stage approach for spatio-temporal video grounding. In: NeurIPS (2022)"},{"key":"14_CR9","series-title":"LNCS","doi-asserted-by":"publisher","first-page":"105","DOI":"10.1007\/978-3-031-19833-5_7","volume-title":"ECCV 2022","author":"C Ju","year":"2022","unstructured":"Ju, C., Han, T., Zheng, K., Zhang, Y., Xie, W.: Prompting visual-language models for efficient video understanding. In: Avidan, S., Brostow, G., Ciss\u00e9, M., Farinella, G.M., Hassner, T. (eds.) ECCV 2022. LNCS, vol. 13695, pp. 105\u2013124. Springer, Cham (2022). https:\/\/doi.org\/10.1007\/978-3-031-19833-5_7"},{"key":"14_CR10","unstructured":"Kaiming\u00a0He, Xiangyu\u00a0Zhang, S.R., Sun, J.: Deep residual learning for image recognition. In: CVPR (2016)"},{"key":"14_CR11","unstructured":"Kalman, R.E.: A new approach to linear filtering and prediction problems. J. Basic Eng. (2011)"},{"key":"14_CR12","doi-asserted-by":"crossref","unstructured":"Kuhn, H.W.: The Hungarian method for the assignment problem. Naval Res. Logistics (1955)","DOI":"10.1002\/nav.3800020109"},{"key":"14_CR13","doi-asserted-by":"crossref","unstructured":"Li, M., et al.: Winner: Weakly-supervised hierarchical decomposition and alignment for spatio-temporal video grounding. In: CVPR (2023)","DOI":"10.1109\/CVPR52729.2023.02211"},{"key":"14_CR14","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"740","DOI":"10.1007\/978-3-319-10602-1_48","volume-title":"Computer Vision \u2013 ECCV 2014","author":"T-Y Lin","year":"2014","unstructured":"Lin, T.-Y., et al.: Microsoft COCO: common objects in context. In: Fleet, D., Pajdla, T., Schiele, B., Tuytelaars, T. (eds.) ECCV 2014. LNCS, vol. 8693, pp. 740\u2013755. Springer, Cham (2014). https:\/\/doi.org\/10.1007\/978-3-319-10602-1_48"},{"key":"14_CR15","doi-asserted-by":"crossref","unstructured":"Lin, Z., Tan, C., Hu, J., Jin, Z., Ye, T., Zheng, W.: Collaborative static and dynamic vision-language streams for spatio-temporal video grounding. In: CVPR (2023)","DOI":"10.1109\/CVPR52729.2023.02212"},{"key":"14_CR16","doi-asserted-by":"crossref","unstructured":"Liu, R., Huang, J., Li, G., Feng, J., Wu, X., Li, T.H.: Revisiting temporal modeling for clip-based image-to-video knowledge transferring. In: CVPR (2023)","DOI":"10.1109\/CVPR52729.2023.00634"},{"key":"14_CR17","doi-asserted-by":"crossref","unstructured":"Luo, D., Huang, J., Gong, S., Jin, H., Liu, Y.: Towards generalisable video moment retrieval: visual-dynamic injection to image-text pre-training. In: CVPR (2023)","DOI":"10.1109\/CVPR52729.2023.02207"},{"key":"14_CR18","unstructured":"Mirshghallah, F., Taram, M., Vepakomma, P., Singh, A., Raskar, R., Esmaeilzadeh, H.: Privacy in deep learning: a survey. arXiv preprint arXiv:2004.12254 (2020)"},{"key":"14_CR19","doi-asserted-by":"crossref","unstructured":"Peng, B., Chen, X., Wang, Y., Lu, C., Qiao, Y.: Conditionvideo: training-free condition-guided text-to-video generation. In: AAAI (2024)","DOI":"10.1609\/aaai.v38i5.28244"},{"key":"14_CR20","unstructured":"Radford, A., Kim, J.W., Hallacy, C., et\u00a0al.: Learning transferable visual models from natural language supervision. In: ICML (2021)"},{"key":"14_CR21","doi-asserted-by":"crossref","unstructured":"Rasheed, H.A., Khattak, M.U., Maaz, M., Khan, S., Khan, F.S.: Fine-tuned clip models are efficient video learners. In: CVPR (2023)","DOI":"10.1109\/CVPR52729.2023.00633"},{"key":"14_CR22","unstructured":"Ren, S., He, K., Girshick, R.B., Sun, J.: Faster R-CNN: towards real-time object detection with region proposal networks. TPAMI (2015)"},{"key":"14_CR23","doi-asserted-by":"crossref","unstructured":"Shi, J., Xu, J., Gong, B., Xu, C.: Not all frames are equal: weakly-supervised video grounding with contextual similarity and visual clustering losses. In: CVPR (2019)","DOI":"10.1109\/CVPR.2019.01069"},{"key":"14_CR24","doi-asserted-by":"crossref","unstructured":"Shtedritski, A., Rupprecht, C., Vedaldi, A.: What does clip know about a red circle? visual prompt engineering for vlms. In: CVPR (2023)","DOI":"10.1109\/ICCV51070.2023.01101"},{"key":"14_CR25","doi-asserted-by":"crossref","unstructured":"Su, R., Xu, Q.Y.D.: Stvgbert: a visual-linguistic transformer based framework for spatio-temporal video grounding. In: ICCV (2021)","DOI":"10.1109\/ICCV48922.2021.00156"},{"key":"14_CR26","doi-asserted-by":"crossref","unstructured":"Subramanian, S., Merrill, W., Darrell, T., Gardner, M., Singh, S., Rohrbach, A.: Reclip: a strong zero-shot baseline for referring expression comprehension. In: ACL (2022)","DOI":"10.18653\/v1\/2022.acl-long.357"},{"key":"14_CR27","doi-asserted-by":"crossref","unstructured":"Tiong, A.M.H., Li, J., Li, B.A., Savarese, S., Hoi, S.C.H.: Plug-and-play VQA: Zero-shot VQA by conjoining large pretrained models with zero training. In: EMNLP Findings (2022)","DOI":"10.18653\/v1\/2022.findings-emnlp.67"},{"key":"14_CR28","doi-asserted-by":"crossref","unstructured":"Wang, Y., Zhang, J., Kan, M., Shan, S., Chen, X.: Self-supervised equivariant attention mechanism for weakly supervised semantic segmentation. In: CVPR (2020)","DOI":"10.1109\/CVPR42600.2020.01229"},{"key":"14_CR29","doi-asserted-by":"crossref","unstructured":"Wasim, S.T., Naseer, M., Khan, S., Khan, F.S., Shah, M.: Vita-clip: video and text adaptive clip via multimodal prompting. In: CVPR (2023)","DOI":"10.1109\/CVPR52729.2023.02206"},{"key":"14_CR30","unstructured":"Xing, J., Wang, M., Hou, X., Dai, G., Wang, J., Liu, Y.: Multimodal adaptation of clip for few-shot action recognition. In: CVPR (2023)"},{"key":"14_CR31","doi-asserted-by":"crossref","unstructured":"Yang, A., Miech, A., Sivic, J., Laptev, I., Schmid, C.: Tubedetr: spatio-temporal video grounding with transformers. In: CVPR (2022)","DOI":"10.1109\/CVPR52688.2022.01595"},{"key":"14_CR32","doi-asserted-by":"crossref","unstructured":"Yu, H., Ding, S., Li, L., Wu, J.: Self-attentive clip hashing for unsupervised cross-modal retrieval. In: MM Asia (2022)","DOI":"10.1145\/3551626.3564945"},{"key":"14_CR33","doi-asserted-by":"crossref","unstructured":"Zhang, G., Liu, B., Zhu, T., Zhou, A., Zhou, W.: Visual privacy attacks and defenses in deep learning: a survey. Artif. Intell. Rev. (2022)","DOI":"10.1007\/s10462-021-10123-y"},{"key":"14_CR34","doi-asserted-by":"crossref","unstructured":"Zhang, R., Wang, S., Duan, Y., Tang, Y., Zhang, Y., Tan, Y.P.: Hoi-aware adaptive network for weakly-supervised action segmentation. IJCAI (2023)","DOI":"10.24963\/ijcai.2023\/191"},{"key":"14_CR35","unstructured":"Zhang, Y., Wei, Y., Jiang, D., Zhang, X., Zuo, W., Tian, Q.: Controlvideo: training-free controllable text-to-video generation. ArXiv (2023)"},{"key":"14_CR36","doi-asserted-by":"crossref","unstructured":"Zhang, Z., Zhao, Z., Zhao, Y., Wang, Q., Liu, H., Gao, L.: Where does it exist: Spatio-temporal video grounding for multi-form sentences. In: CVPR (2020)","DOI":"10.1109\/CVPR42600.2020.01068"},{"key":"14_CR37","doi-asserted-by":"crossref","unstructured":"Zhang, Z., Zhao, Z., Lin, Z., Huai, B., Yuan, N. J..: Object-aware multi-branch relation networks for spatio-temporal video grounding. In: IJCAI (2021)","DOI":"10.24963\/ijcai.2020\/149"},{"key":"14_CR38","unstructured":"Tang, Z., et al.: Human-centric spatio-temporal video grounding with visual transformers. TCSVT (2021)"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-73010-8_14","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,11,9]],"date-time":"2024-11-09T14:05:13Z","timestamp":1731161113000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-73010-8_14"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,11,10]]},"ISBN":["9783031730092","9783031730108"],"references-count":38,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-73010-8_14","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,11,10]]},"assertion":[{"value":"10 November 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}