{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,3]],"date-time":"2026-02-03T08:33:18Z","timestamp":1770107598452,"version":"3.49.0"},"reference-count":56,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","license":[{"start":{"date-parts":[[2026,1,1]],"date-time":"2026-01-01T00:00:00Z","timestamp":1767225600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2026,1,1]],"date-time":"2026-01-01T00:00:00Z","timestamp":1767225600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2026,1,1]],"date-time":"2026-01-01T00:00:00Z","timestamp":1767225600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"name":"National Key Research and Development Program of China","award":["2023YFB3107401"],"award-info":[{"award-number":["2023YFB3107401"]}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["T2341003"],"award-info":[{"award-number":["T2341003"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62521002"],"award-info":[{"award-number":["62521002"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["U2441240"],"award-info":[{"award-number":["U2441240"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["U24B20185"],"award-info":[{"award-number":["U24B20185"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62376210"],"award-info":[{"award-number":["62376210"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62132011"],"award-info":[{"award-number":["62132011"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62406240"],"award-info":[{"award-number":["62406240"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Trans.Inform.Forensic Secur."],"published-print":{"date-parts":[[2026]]},"DOI":"10.1109\/tifs.2026.3657094","type":"journal-article","created":{"date-parts":[[2026,1,22]],"date-time":"2026-01-22T21:00:16Z","timestamp":1769115616000},"page":"1484-1496","source":"Crossref","is-referenced-by-count":0,"title":["Adversarial Video Promotion Against Text-to-Video Retrieval"],"prefix":"10.1109","volume":"21","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-4575-292X","authenticated-orcid":false,"given":"Qiwei","family":"Tian","sequence":"first","affiliation":[{"name":"Faculty of Electronic and Information Engineering, Xi&#x2019;an Jiaotong University, Xi&#x2019;an, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6265-7345","authenticated-orcid":false,"given":"Chenhao","family":"Lin","sequence":"additional","affiliation":[{"name":"Faculty of Electronic and Information Engineering, Xi&#x2019;an Jiaotong University, Xi&#x2019;an, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0745-4294","authenticated-orcid":false,"given":"Zhengyu","family":"Zhao","sequence":"additional","affiliation":[{"name":"Faculty of Electronic and Information Engineering, Xi&#x2019;an Jiaotong University, Xi&#x2019;an, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0327-6729","authenticated-orcid":false,"given":"Shuai","family":"Liu","sequence":"additional","affiliation":[{"name":"School of Cyber Science and Engineering, Faculty of Electronic and Information Engineering, Xi&#x2019;an Jiaotong University, Xi&#x2019;an, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1073-7810","authenticated-orcid":false,"given":"Qian","family":"Li","sequence":"additional","affiliation":[{"name":"Faculty of Electronic and Information Engineering, Xi&#x2019;an Jiaotong University, Xi&#x2019;an, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6959-0569","authenticated-orcid":false,"given":"Chao","family":"Shen","sequence":"additional","affiliation":[{"name":"Faculty of Electronic and Information Engineering, Xi&#x2019;an Jiaotong University, Xi&#x2019;an, China"}]}],"member":"263","reference":[{"key":"ref1","article-title":"Clip2tv: An empirical study on transformer-based methods for video-text retrieval","author":"Gao","year":"2021","journal-title":"arXiv:2111.05610"},{"key":"ref2","first-page":"8748","article-title":"Learning transferable visual models from natural language supervision","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Radford"},{"key":"ref3","article-title":"CLIP-ViP: Adapting pre-trained image-text model to video-language representation alignment","author":"Xue","year":"2022","journal-title":"arXiv:2209.06430"},{"key":"ref4","first-page":"12888","article-title":"BLIP: Bootstrapping language-image pre-training for unified vision-language understanding and generation","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Li"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00244"},{"key":"ref6","first-page":"487","article-title":"Revealing single frame bias for video-and-language learning","volume-title":"Proc. 61st Annu. Meeting Assoc. Comput. Linguistics","author":"Lei"},{"key":"ref7","article-title":"Disentangled representation learning for text-video retrieval","author":"Wang","year":"2022","journal-title":"arXiv:2203.07111"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr52729.2023.01031"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.571"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.618"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v33i01.33019127"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP48485.2024.10448358"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3547801"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00016"},{"key":"ref15","article-title":"A study of the effect of JPG compression on adversarial images","author":"Dziugaite","year":"2016","journal-title":"arXiv:1608.00853"},{"key":"ref16","doi-asserted-by":"crossref","first-page":"388","DOI":"10.1016\/j.neunet.2023.10.033","article-title":"Temporal shuffling for defending deep action recognition models against adversarial attacks","volume":"169","author":"Hwang","year":"2024","journal-title":"Neural Netw."},{"key":"ref17","article-title":"Efficient estimation of word representations in vector space","author":"Mikolov","year":"2013","journal-title":"arXiv:1301.3781"},{"key":"ref18","first-page":"2","article-title":"Bert: Pre-training of deep bidirectional transformers for language understanding","volume-title":"Proc. NaacL-HLT","volume":"1","author":"Kenton"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.1907.11692"},{"key":"ref20","article-title":"Improving language understanding by generative pre-training","author":"Radford","year":"2018"},{"key":"ref21","article-title":"An image is worth 16\u00d716 words: Transformers for image recognition at scale","author":"Dosovitskiy","year":"2020","journal-title":"arXiv:2010.11929"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00636"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D19-1514"},{"key":"ref24","first-page":"13","article-title":"ViLBERT: Pretraining task-agnostic visiolinguistic representations for vision-and-language tasks","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Lu"},{"key":"ref25","article-title":"UNITER: Universal image-text representation learning","author":"Chen","year":"2019","journal-title":"arXiv:1909.11740"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58577-8_8"},{"key":"ref27","article-title":"Align before fuse: Vision and language representation learning with momentum distillation","author":"Li","year":"2021","journal-title":"arXiv:2107.07651"},{"key":"ref28","article-title":"BEiT: BERT pre-training of image transformers","author":"Bao","year":"2021","journal-title":"arXiv:2106.08254"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00756"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00877"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.emnlp-main.544"},{"key":"ref32","first-page":"30291","article-title":"Expectation-maximization contrastive learning for compact video-and-language representations","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Jin"},{"key":"ref33","article-title":"Use what you have: Video retrieval using representations from collaborative experts","author":"Liu","year":"2019","journal-title":"arXiv:1907.13487"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1016\/j.neucom.2022.07.028"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-77004-4_1"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i6.28327"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01622"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00334"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1145\/3404835.3462887"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01610"},{"key":"ref41","article-title":"Attacking visual language grounding with adversarial examples: A case study on neural image captioning","author":"Chen","year":"2017","journal-title":"arXiv:1712.02051"},{"key":"ref42","article-title":"On evaluating adversarial robustness of large vision-language models","author":"Zhao","year":"2023","journal-title":"arXiv:2305.16934"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00335"},{"key":"ref44","article-title":"I see dead people: Gray-box adversarial attack on image-to-text models","author":"Lapid","year":"2023","journal-title":"arXiv:2306.07591"},{"key":"ref45","first-page":"52936","article-title":"VLATTACK: Multimodal adversarial attacks on vision-language tasks via pre-trained models","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Yin"},{"key":"ref46","article-title":"SA-attack: Improving adversarial transferability of vision-language pre-training models via self-augmentation","author":"He","year":"2023","journal-title":"arXiv:2312.04913"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72998-0_25"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2025.3581476"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.1109\/TDSC.2025.3601232"},{"key":"ref50","first-page":"1665","article-title":"Gleam: Enhanced transferable adversarial attacks for vision-language pre-training models via global\u2013local transformations","volume-title":"Proc. IEEE\/CVF Int. Conf. Comput. Vis. (ICCV)","author":"Liu"},{"key":"ref51","article-title":"CLIP4Clip: An empirical study of CLIP for end to end video clip retrieval","author":"Luo","year":"2021","journal-title":"arXiv:2104.08860"},{"key":"ref52","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00175"},{"key":"ref53","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01569"},{"key":"ref54","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10096984"},{"key":"ref55","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"ref56","article-title":"Towards deep learning models resistant to adversarial attacks","author":"Madry","year":"2017","journal-title":"arXiv:1706.06083"}],"container-title":["IEEE Transactions on Information Forensics and Security"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/10206\/11313711\/11361157.pdf?arnumber=11361157","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,2,2]],"date-time":"2026-02-02T20:43:23Z","timestamp":1770065003000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11361157\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026]]},"references-count":56,"URL":"https:\/\/doi.org\/10.1109\/tifs.2026.3657094","relation":{},"ISSN":["1556-6013","1556-6021"],"issn-type":[{"value":"1556-6013","type":"print"},{"value":"1556-6021","type":"electronic"}],"subject":[],"published":{"date-parts":[[2026]]}}}