{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,3,26]],"date-time":"2025-03-26T20:30:28Z","timestamp":1743021028658,"version":"3.40.3"},"publisher-location":"Cham","reference-count":88,"publisher":"Springer Nature Switzerland","isbn-type":[{"type":"print","value":"9783031729881"},{"type":"electronic","value":"9783031729898"}],"license":[{"start":{"date-parts":[[2024,10,26]],"date-time":"2024-10-26T00:00:00Z","timestamp":1729900800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,10,26]],"date-time":"2024-10-26T00:00:00Z","timestamp":1729900800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-72989-8_5","type":"book-chapter","created":{"date-parts":[[2024,10,25]],"date-time":"2024-10-25T17:02:04Z","timestamp":1729875724000},"page":"77-98","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Meta-optimized Angular Margin Contrastive Framework for\u00a0Video-Language Representation Learning"],"prefix":"10.1007","author":[{"given":"Thong","family":"Nguyen","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yi","family":"Bin","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Xiaobao","family":"Wu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Xinshuai","family":"Dong","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Zhiyuan","family":"Hu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Khoi","family":"Le","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Cong-Duy","family":"Nguyen","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"See-Kiong","family":"Ng","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Luu Anh","family":"Tuan","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2024,10,26]]},"reference":[{"key":"5_CR1","first-page":"24206","volume":"34","author":"H Akbari","year":"2021","unstructured":"Akbari, H., et al.: VATT: transformers for multimodal self-supervised learning from raw video, audio and text. Adv. Neural. Inf. Process. Syst. 34, 24206\u201324221 (2021)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"5_CR2","doi-asserted-by":"crossref","unstructured":"Anne\u00a0Hendricks, L., Wang, O., Shechtman, E., Sivic, J., Darrell, T., Russell, B.: Localizing moments in video with natural language. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 5803\u20135812 (2017)","DOI":"10.1109\/ICCV.2017.618"},{"key":"5_CR3","doi-asserted-by":"crossref","unstructured":"Bain, M., Nagrani, A., Varol, G., Zisserman, A.: Frozen in time: a joint video and image encoder for end-to-end retrieval. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 1728\u20131738 (2021)","DOI":"10.1109\/ICCV48922.2021.00175"},{"key":"5_CR4","unstructured":"Bain, M., Nagrani, A., Varol, G., Zisserman, A.: A clip-hitchhiker\u2019s guide to long video retrieval. arXiv preprint arXiv:2205.08508 (2022)"},{"key":"5_CR5","doi-asserted-by":"crossref","unstructured":"Bin, Y., et al.: Non-autoregressive math word problem solver with unified tree structure. In: Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing, pp. 3290\u20133301 (2023)","DOI":"10.18653\/v1\/2023.emnlp-main.199"},{"key":"5_CR6","doi-asserted-by":"crossref","unstructured":"Buch, S., Eyzaguirre, C., Gaidon, A., Wu, J., Fei-Fei, L., Niebles, J.C.: Revisiting the \u201cvideo\u201d in video-language understanding. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 2917\u20132927 (2022)","DOI":"10.1109\/CVPR52688.2022.00293"},{"key":"5_CR7","unstructured":"Chen, D., Dolan, W.B.: Collecting highly parallel data for paraphrase evaluation. In: Proceedings of the 49th Annual Meeting of the Association for Computational Linguistics: Human Language Technologies, pp. 190\u2013200 (2011)"},{"key":"5_CR8","doi-asserted-by":"crossref","unstructured":"Cheng, F., Wang, X., Lei, J., Crandall, D., Bansal, M., Bertasius, G.: VindLU: a recipe for effective video-and-language pretraining. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 10739\u201310750 (2023)","DOI":"10.1109\/CVPR52729.2023.01034"},{"key":"5_CR9","doi-asserted-by":"crossref","unstructured":"Coria, J.M., Ghannay, S., Rosset, S., Bredin, H.: A metric learning approach to misogyny categorization. In: Proceedings of the 5th Workshop on Representation Learning for NLP, pp. 89\u201394 (2020)","DOI":"10.18653\/v1\/2020.repl4nlp-1.12"},{"issue":"48","key":"5_CR10","first-page":"7","volume":"24","author":"BC Cs\u00e1ji","year":"2001","unstructured":"Cs\u00e1ji, B.C., et al.: Approximation with artificial neural networks. Fac. Sci. ETVS Lornd Univ. Hungary 24(48), 7 (2001)","journal-title":"Fac. Sci. ETVS Lornd Univ. Hungary"},{"key":"5_CR11","unstructured":"Dai, W., et al.: InstructBLIP: Towards general-purpose vision-language models with instruction tuning. arxiv 2023. arXiv preprint arXiv:2305.06500"},{"key":"5_CR12","doi-asserted-by":"crossref","unstructured":"Deng, J., Guo, J., Xue, N., Zafeiriou, S.: ArcFace: additive angular margin loss for deep face recognition. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 4690\u20134699 (2019)","DOI":"10.1109\/CVPR.2019.00482"},{"key":"5_CR13","doi-asserted-by":"crossref","unstructured":"Fabian Caba\u00a0Heilbron, Victor\u00a0Escorcia, B.G., Niebles, J.C.: ActivityNet: a large-scale video benchmark for human activity understanding. In: The IEEE Conference on Computer Vision and Pattern Recognition (CVPR) (2015)","DOI":"10.1109\/CVPR.2015.7298698"},{"key":"5_CR14","unstructured":"Fernando, D.l.T., Mkchael, J.B.: A framework for robust subspace learning. Int. J. Comput. Vis. 54(1), 117\u2013142 (2003)"},{"issue":"1","key":"5_CR15","doi-asserted-by":"publisher","first-page":"119","DOI":"10.1006\/jcss.1997.1504","volume":"55","author":"Y Freund","year":"1997","unstructured":"Freund, Y., Schapire, R.E.: A decision-theoretic generalization of on-line learning and an application to boosting. J. Comput. Syst. Sci. 55(1), 119\u2013139 (1997)","journal-title":"J. Comput. Syst. Sci."},{"key":"5_CR16","unstructured":"Fu, T.J., Li, L., Gan, Z., Lin, K., Wang, W.Y., Wang, L., Liu, Z.: VIOLET: End-to-end video-language transformers with masked visual-token modeling. arXiv preprint arXiv:2111.12681 (2021)"},{"key":"5_CR17","doi-asserted-by":"crossref","unstructured":"Fu, T.J., et al.: An empirical study of end-to-end video-language transformers with masked visual modeling. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 22898\u201322909 (2023)","DOI":"10.1109\/CVPR52729.2023.02193"},{"key":"5_CR18","unstructured":"Gao, Z., Liu, J., Sun, W., Chen, S., Chang, D., Zhao, L.: CLIP2TV: Align, match and distill for video-text retrieval. arXiv preprint arXiv:2111.05610 (2021)"},{"key":"5_CR19","doi-asserted-by":"crossref","unstructured":"Han, T., Xie, W., Zisserman, A.: Temporal alignment networks for long-term video. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 2906\u20132916 (2022)","DOI":"10.1109\/CVPR52688.2022.00292"},{"key":"5_CR20","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., Sun, J.: Deep Residual Learning for Image Recognition. In: The IEEE Conference on Computer Vision and Pattern Recognition (CVPR) (2016)","DOI":"10.1109\/CVPR.2016.90"},{"key":"5_CR21","doi-asserted-by":"crossref","unstructured":"Jang, Y., Song, Y., Yu, Y., Kim, Y., Kim, G.: TGIF-QA: toward Spatio-temporal reasoning in visual question answering. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 2758\u20132766 (2017)","DOI":"10.1109\/CVPR.2017.149"},{"key":"5_CR22","unstructured":"Jiang, L., Meng, D., Yu, S.I., Lan, Z., Shan, S., Hauptmann, A.: Self-paced learning with diversity. In: Advances in Neural Information Processing Systems, vol. 27 (2014)"},{"key":"5_CR23","doi-asserted-by":"crossref","unstructured":"Jin, P., et al.: Video-text as game players: hierarchical banzhaf interaction for cross-modal representation learning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 2472\u20132482 (2023)","DOI":"10.1109\/CVPR52729.2023.00244"},{"key":"5_CR24","doi-asserted-by":"crossref","unstructured":"Krishna, R., Hata, K., Ren, F., Fei-Fei, L., Carlos\u00a0Niebles, J.: Dense-captioning events in videos. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 706\u2013715 (2017)","DOI":"10.1109\/ICCV.2017.83"},{"key":"5_CR25","doi-asserted-by":"crossref","unstructured":"Lei, C., et al.: Understanding Chinese video and language via contrastive multimodal pre-training. In: Proceedings of the 29th ACM International Conference on Multimedia, pp. 2567\u20132576 (2021)","DOI":"10.1145\/3474085.3475431"},{"key":"5_CR26","unstructured":"Lei, J., Berg, T.L., Bansal, M.: Revealing single frame bias for video-and-language learning. arXiv preprint arXiv:2206.03428 (2022)"},{"key":"5_CR27","doi-asserted-by":"crossref","unstructured":"Lei, J., et al.: Less is more: ClipBERT for video-and-language learning via sparse sampling. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 7331\u20137341 (2021)","DOI":"10.1109\/CVPR46437.2021.00725"},{"key":"5_CR28","doi-asserted-by":"crossref","unstructured":"Li, B., Han, Z., Li, H., Fu, H., Zhang, C.: Trustworthy long-tailed classification. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 6970\u20136979 (2022)","DOI":"10.1109\/CVPR52688.2022.00684"},{"key":"5_CR29","doi-asserted-by":"crossref","unstructured":"Li, H., Bin, Y., Liao, J., Yang, Y., Shen, H.T.: Your negative may not be true negative: boosting image-text matching with false negative elimination. In: Proceedings of the 31st ACM International Conference on Multimedia, pp. 924\u2013934 (2023)","DOI":"10.1145\/3581783.3612101"},{"key":"5_CR30","doi-asserted-by":"crossref","unstructured":"Li, J., Niu, L., Zhang, L.: From representation to reasoning: towards both evidence and commonsense reasoning for video question-answering. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 21273\u201321282 (2022)","DOI":"10.1109\/CVPR52688.2022.02059"},{"key":"5_CR31","unstructured":"Li, J., Li, D., Savarese, S., Hoi, S.: BLIP-2: Bootstrapping language-image pre-training with frozen image encoders and large language models. arXiv preprint arXiv:2301.12597 (2023)"},{"key":"5_CR32","doi-asserted-by":"crossref","unstructured":"Li, L., Chen, Y.C., Cheng, Y., Gan, Z., Yu, L., Liu, J.: Hero: Hierarchical encoder for video+ language omni-representation pre-training. arXiv preprint arXiv:2005.00200 (2020)","DOI":"10.18653\/v1\/2020.emnlp-main.161"},{"key":"5_CR33","doi-asserted-by":"crossref","unstructured":"Li, L., et al.: LAVENDER: unifying video-language understanding as masked language modeling. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 23119\u201323129 (2023)","DOI":"10.1109\/CVPR52729.2023.02214"},{"key":"5_CR34","doi-asserted-by":"crossref","unstructured":"Lin, T.Y., Goyal, P., Girshick, R., He, K., Doll\u00e1r, P.: Focal loss for dense object detection. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 2980\u20132988 (2017)","DOI":"10.1109\/ICCV.2017.324"},{"key":"5_CR35","doi-asserted-by":"crossref","unstructured":"Lin, T.Y., Goyal, P., Girshick, R., He, K., Doll\u00e1r, P.: Focal loss for dense object detection. IEEE Trans. Pattern Anal. Mach. Intell. 42(2), 318\u2013327 (2018)","DOI":"10.1109\/TPAMI.2018.2858826"},{"key":"5_CR36","doi-asserted-by":"publisher","unstructured":"Lin, Y.B., Lei, J., Bansal, M., Bertasius, G.: EclipSE: efficient long-range video retrieval using sight and sound. In: Avidan, S., Brostow, G., Ciss\u00e9, M., Farinella, G.M., Hassner, T. (eds.) Computer Vision - ECCV 2022. ECCV 2022. LNCS, vol. 13694. Springer, Cham (2022). https:\/\/doi.org\/10.1007\/978-3-031-19830-4_24","DOI":"10.1007\/978-3-031-19830-4_24"},{"key":"5_CR37","unstructured":"Liu, H., Li, C., Wu, Q., Lee, Y.J.: Visual instruction tuning. arXiv preprint arXiv:2304.08485 (2023)"},{"key":"5_CR38","unstructured":"Liu, Z., et al.: Video swin transformer. arXiv preprint arXiv:2106.13230 (2021)"},{"key":"5_CR39","unstructured":"Luo, H., et al.: UniVL: A unified video and language pre-training model for multimodal understanding and generation. arXiv preprint arXiv:2002.06353 (2020)"},{"key":"5_CR40","doi-asserted-by":"publisher","first-page":"293","DOI":"10.1016\/j.neucom.2022.07.028","volume":"508","author":"H Luo","year":"2022","unstructured":"Luo, H., et al.: Clip4Clip: an empirical study of clip for end to end video clip retrieval and captioning. Neurocomputing 508, 293\u2013304 (2022)","journal-title":"Neurocomputing"},{"key":"5_CR41","doi-asserted-by":"crossref","unstructured":"Ma, X., Santos, C.N.d., Arnold, A.O.: Contrastive fine-tuning improves robustness for neural rankers. arXiv preprint arXiv:2105.12932 (2021)","DOI":"10.18653\/v1\/2021.findings-acl.51"},{"key":"5_CR42","doi-asserted-by":"crossref","unstructured":"Malisiewicz, T., Gupta, A., Efros, A.A.: Ensemble of exemplar-SVMs for object detection and beyond. In: ICCV (2011)","DOI":"10.1109\/ICCV.2011.6126229"},{"issue":"5s","key":"5_CR43","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3585388","volume":"19","author":"X Man","year":"2023","unstructured":"Man, X., Shao, J., Chen, F., Zhang, M., Shen, H.T.: TEVL: trilinear encoder for video-language representation learning. ACM Trans. Multimed. Comput. Commun. Appl. 19(5s), 1\u201320 (2023)","journal-title":"ACM Trans. Multimed. Comput. Commun. Appl."},{"key":"5_CR44","doi-asserted-by":"crossref","unstructured":"Miech, A., Zhukov, D., Alayrac, J.B., Tapaswi, M., Laptev, I., Sivic, J.: HowTo100M: Learning a text-video embedding by watching hundred million narrated video clips. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 2630\u20132640 (2019)","DOI":"10.1109\/ICCV.2019.00272"},{"key":"5_CR45","doi-asserted-by":"crossref","unstructured":"Nguyen, C.D., Nguyen, T., Vu, D., Luu, A.: Improving multimodal sentiment analysis: supervised angular margin-based contrastive learning for enhanced fusion representation. In: Findings of the Association for Computational Linguistics: EMNLP 2023, pp. 14714\u201314724 (2023)","DOI":"10.18653\/v1\/2023.findings-emnlp.980"},{"key":"5_CR46","doi-asserted-by":"crossref","unstructured":"Nguyen, C.D., Nguyen, T., Wu, X., Luu, A.T.: KDMCSE: knowledge distillation multimodal sentence embeddings with adaptive angular margin contrastive learning. arXiv preprint arXiv:2403.17486 (2024)","DOI":"10.18653\/v1\/2024.naacl-long.42"},{"key":"5_CR47","doi-asserted-by":"crossref","unstructured":"Nguyen, T., et al.: Video-language understanding: A survey from model architecture, model training, and data perspectives. arXiv preprint arXiv:2406.05615 (2024)","DOI":"10.18653\/v1\/2024.findings-acl.217"},{"key":"5_CR48","first-page":"11974","volume":"34","author":"T Nguyen","year":"2021","unstructured":"Nguyen, T., Luu, A.T.: Contrastive learning for neural topic model. Adv. Neural. Inf. Process. Syst. 34, 11974\u201311986 (2021)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"5_CR49","doi-asserted-by":"crossref","unstructured":"Nguyen, T., et al.: READ-PVLA: recurrent adapter with partial video-language alignment for parameter-efficient transfer learning in low-resource video-language modeling. In: Proceedings of the AAAI Conference on Artificial Intelligence. vol.\u00a038, pp. 18824\u201318832 (2024)","DOI":"10.1609\/aaai.v38i17.29847"},{"key":"5_CR50","doi-asserted-by":"crossref","unstructured":"Nguyen, T., Wu, X., Dong, X., Nguyen, C.D., Ng, S.K., Luu, A.: DemaFormer: damped exponential moving average transformer with energy-based modeling for temporal language grounding. In: Findings of the Association for Computational Linguistics: EMNLP 2023, pp. 3635\u20133649 (2023)","DOI":"10.18653\/v1\/2023.findings-emnlp.235"},{"key":"5_CR51","unstructured":"Nguyen, T., Wu, X., Dong, X., Nguyen, C.D.T., Ng, S.K., Luu, A.T.: Topic modeling as multi-objective contrastive optimization. arXiv preprint arXiv:2402.07577 (2024)"},{"key":"5_CR52","doi-asserted-by":"crossref","unstructured":"Nguyen, T., Wu, X., Luu, A.T., Nguyen, C.D., Hai, Z., Bing, L.: Adaptive contrastive learning on multimodal transformer for review helpfulness predictions. arXiv preprint arXiv:2211.03524 (2022)","DOI":"10.18653\/v1\/2022.emnlp-main.686"},{"key":"5_CR53","doi-asserted-by":"crossref","unstructured":"Peng, L., Yang, S., Bin, Y., Wang, G.: Progressive graph attention network for video question answering. In: Proceedings of the 29th ACM International Conference on Multimedia, pp. 2871\u20132879 (2021)","DOI":"10.1145\/3474085.3475193"},{"key":"5_CR54","unstructured":"Radford, A., et\u00a0al.: Learning transferable visual models from natural language supervision. In: International Conference on Machine Learning, pp. 8748\u20138763 (2021)"},{"key":"5_CR55","unstructured":"Ren, M., Zeng, W., Yang, B., Urtasun, R.: Learning to reweight examples for robust deep learning. In: International Conference on Machine Learning, pp. 4334\u20134343. PMLR (2018)"},{"key":"5_CR56","unstructured":"Ren, S., He, K., Girshick, R., Sun, J.: Faster R-CNN: towards real-time object detection with region proposal networks. In: Advances in Neural Information Processing Systems, vol. 28 (2015)"},{"key":"5_CR57","doi-asserted-by":"crossref","unstructured":"Seo, P.H., Nagrani, A., Arnab, A., Schmid, C.: End-to-end generative pretraining for multimodal video captioning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 17959\u201317968 (2022)","DOI":"10.1109\/CVPR52688.2022.01743"},{"key":"5_CR58","doi-asserted-by":"crossref","unstructured":"Shang, X., Di, D., Xiao, J., Cao, Y., Yang, X., Chua, T.S.: Annotating objects and relations in user-generated videos. In: Proceedings of the 2019 on International Conference on Multimedia Retrieval, pp. 279\u2013287 (2019)","DOI":"10.1145\/3323873.3325056"},{"key":"5_CR59","unstructured":"Shu, J., et al.: Meta-weight-net: learning an explicit mapping for sample weighting. In: Advances in Neural Information Processing Systems, vol. 32 (2019)"},{"key":"5_CR60","unstructured":"Sohn, K.: Improved deep metric learning with multi-class n-pair loss objective. In: Advances in Neural Information Processing Systems, vol. 29 (2016)"},{"key":"5_CR61","unstructured":"Sun, C., Baradel, F., Murphy, K., Schmid, C.: Learning video representations using contrastive bidirectional transformer. arXiv preprint arXiv:1906.05743 (2019)"},{"key":"5_CR62","doi-asserted-by":"crossref","unstructured":"Sun, C., Myers, A., Vondrick, C., Murphy, K., Schmid, C.: VideoBERT: a joint model for video and language representation learning. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 7464\u20137473 (2019)","DOI":"10.1109\/ICCV.2019.00756"},{"issue":"12","key":"5_CR63","doi-asserted-by":"publisher","first-page":"3358","DOI":"10.1016\/j.patcog.2007.04.009","volume":"40","author":"Y Sun","year":"2007","unstructured":"Sun, Y., Kamel, M.S., Wong, A.K., Wang, Y.: Cost-sensitive boosting for classification of imbalanced data. Pattern Recogn. 40(12), 3358\u20133378 (2007)","journal-title":"Pattern Recogn."},{"key":"5_CR64","doi-asserted-by":"crossref","unstructured":"Tang, Z., Lei, J., Bansal, M.: DeCEMBERT: learning from noisy instructional videos via dense captions and entropy minimization. In: Proceedings of the 2021 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, pp. 2415\u20132426 (2021)","DOI":"10.18653\/v1\/2021.naacl-main.193"},{"key":"5_CR65","doi-asserted-by":"publisher","first-page":"489","DOI":"10.1109\/TIP.2019.2931534","volume":"29","author":"A Wang","year":"2019","unstructured":"Wang, A., Luu, A.T., Foo, C.S., Zhu, H., Tay, Y., Chandrasekhar, V.: Holistic multi-modal memory network for movie question answering. IEEE Trans. Image Process. 29, 489\u2013499 (2019)","journal-title":"IEEE Trans. Image Process."},{"key":"5_CR66","doi-asserted-by":"crossref","unstructured":"Wang, J., et\u00a0al.: All in one: exploring unified video-language pre-training. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 6598\u20136608 (2023)","DOI":"10.1109\/CVPR52729.2023.00638"},{"key":"5_CR67","first-page":"5696","volume":"35","author":"J Wang","year":"2022","unstructured":"Wang, J., et al.: OmniVL: one foundation model for image-language and video-language tasks. Adv. Neural. Inf. Process. Syst. 35, 5696\u20135710 (2022)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"5_CR68","unstructured":"Wang, Q., Zhang, Y., Zheng, Y., Pan, P., Hua, X.S.: Disentangled representation learning for text-video retrieval. arXiv preprint arXiv:2203.07111 (2022)"},{"key":"5_CR69","unstructured":"Wang, Y., Kucukelbir, A., Blei, D.M.: Robust probabilistic modeling with bayesian data reweighting. In: International Conference on Machine Learning, pp. 3646\u20133655. PMLR (2017)"},{"key":"5_CR70","doi-asserted-by":"crossref","unstructured":"Wei, J., Hu, G., Tuan, L.A., Yang, X., Zhu, W.: Multi-scale receptive field graph model for emotion recognition in conversations. In: ICASSP 2023-2023 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp.\u00a01\u20135. IEEE (2023)","DOI":"10.1109\/ICASSP49357.2023.10094596"},{"key":"5_CR71","doi-asserted-by":"crossref","unstructured":"Wei, J., Hu, G., Yang, X., Luu, A.T., Dong, Y.: Audio-visual domain adaptation feature fusion for speech emotion recognition. In: INTERSPEECH, pp. 1988\u20131992 (2022)","DOI":"10.21437\/Interspeech.2022-703"},{"key":"5_CR72","doi-asserted-by":"publisher","DOI":"10.1016\/j.eswa.2023.121419","volume":"237","author":"J Wei","year":"2024","unstructured":"Wei, J., Hu, G., Yang, X., Luu, A.T., Dong, Y.: Learning facial expression and body gesture visual information for video emotion recognition. Expert Syst. Appl. 237, 121419 (2024)","journal-title":"Expert Syst. Appl."},{"key":"5_CR73","doi-asserted-by":"crossref","unstructured":"Wu, X., Dong, X., Nguyen, T., Liu, C., Pan, L.M., Luu, A.T.: InfoCTM: a mutual information maximization perspective of cross-lingual topic modeling. In: Proceedings of the AAAI Conference on Artificial Intelligence. vol.\u00a037, pp. 13763\u201313771 (2023)","DOI":"10.1609\/aaai.v37i11.26612"},{"key":"5_CR74","doi-asserted-by":"crossref","unstructured":"Wu, X., Dong, X., Pan, L., Nguyen, T., Luu, A.T.: Modeling dynamic topics in chain-free fashion by evolution-tracking contrastive learning and unassociated word exclusion. In: Findings of the Association for Computational Linguistics: ACL 2024. Association for Computational Linguistics (2024)","DOI":"10.18653\/v1\/2024.findings-acl.183"},{"key":"5_CR75","doi-asserted-by":"crossref","unstructured":"Wu, X., Li, C., Zhu, Y., Miao, Y.: Short text topic modeling with topic distribution quantization and negative sampling decoder. In: Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing (EMNLP), pp. 1772\u20131782. Online (2020). https:\/\/aclanthology.org\/2020.emnlp-main.138.pdf","DOI":"10.18653\/v1\/2020.emnlp-main.138"},{"key":"5_CR76","doi-asserted-by":"crossref","unstructured":"Wu, X., Luu, A.T., Dong, X.: Mitigating data sparsity for short text topic modeling by topic-semantic contrastive learning. In: Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing. pp. 2748\u20132760. Association for Computational Linguistics, Abu Dhabi, United Arab Emirates (2022). https:\/\/aclanthology.org\/2022.emnlp-main.176","DOI":"10.18653\/v1\/2022.emnlp-main.176"},{"key":"5_CR77","doi-asserted-by":"crossref","unstructured":"Xiao, J., Shang, X., Yao, A., Chua, T.S.: Next-qa: next phase of question-answering to explaining temporal actions. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition. pp. 9777\u20139786 (2021)","DOI":"10.1109\/CVPR46437.2021.00965"},{"key":"5_CR78","doi-asserted-by":"crossref","unstructured":"Xiao, J., et al.: Contrastive video question answering via video graph transformer. arXiv preprint arXiv:2302.13668 (2023)","DOI":"10.1109\/TPAMI.2023.3292266"},{"key":"5_CR79","doi-asserted-by":"crossref","unstructured":"Xu, H., et al.: VLM: Task-agnostic video-language model pre-training for video understanding. arXiv preprint arXiv:2105.09996 (2021)","DOI":"10.18653\/v1\/2021.findings-acl.370"},{"key":"5_CR80","doi-asserted-by":"crossref","unstructured":"Xu, H., et al.: VideoCLIP: Contrastive pre-training for zero-shot video-text understanding. arXiv preprint arXiv:2109.14084 (2021)","DOI":"10.18653\/v1\/2021.emnlp-main.544"},{"key":"5_CR81","doi-asserted-by":"crossref","unstructured":"Xu, J., Mei, T., Yao, T., Rui, Y.: MSR-VTT: a large video description dataset for bridging video and language. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 5288\u20135296 (2016)","DOI":"10.1109\/CVPR.2016.571"},{"key":"5_CR82","unstructured":"Xue, H., et al.: CLIP-ViP: Adapting pre-trained image-text model to video-language representation alignment. arXiv preprint arXiv:2209.06430 (2022)"},{"key":"5_CR83","doi-asserted-by":"crossref","unstructured":"Yang, A., et al.: Vid2Seq: large-scale pretraining of a visual language model for dense video captioning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 10714\u201310726 (2023)","DOI":"10.1109\/CVPR52729.2023.01032"},{"key":"5_CR84","doi-asserted-by":"crossref","unstructured":"Yu, T., Dai, W., Liu, Z., Fung, P.: Vision guided generative pre-trained language models for multimodal abstractive summarization. arXiv preprint arXiv:2109.02401 (2021)","DOI":"10.18653\/v1\/2021.emnlp-main.326"},{"key":"5_CR85","first-page":"23634","volume":"34","author":"R Zellers","year":"2021","unstructured":"Zellers, R., et al.: MERLOT: multimodal neural script knowledge models. Adv. Neural. Inf. Process. Syst. 34, 23634\u201323651 (2021)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"5_CR86","unstructured":"Zhang, Z., Sabuncu, M.R.: Generalized cross entropy loss for training deep neural networks with noisy labels. In: NeurIPS (2018)"},{"key":"5_CR87","doi-asserted-by":"crossref","unstructured":"Zhao, Y., Misra, I., Kr\u00e4henb\u00fchl, P., Girdhar, R.: Learning video representations from large language models. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 6586\u20136597 (2023)","DOI":"10.1109\/CVPR52729.2023.00637"},{"key":"5_CR88","doi-asserted-by":"crossref","unstructured":"Zhu, L., Yang, Y.: ActBERT: learning global-local video-text representations. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 8746\u20138755 (2020)","DOI":"10.1109\/CVPR42600.2020.00877"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-72989-8_5","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,10,25]],"date-time":"2024-10-25T17:10:34Z","timestamp":1729876234000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-72989-8_5"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,26]]},"ISBN":["9783031729881","9783031729898"],"references-count":88,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-72989-8_5","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2024,10,26]]},"assertion":[{"value":"26 October 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}