{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,1]],"date-time":"2026-05-01T17:49:52Z","timestamp":1777657792395,"version":"3.51.4"},"publisher-location":"New York, NY, USA","reference-count":62,"publisher":"ACM","license":[{"start":{"date-parts":[[2021,10,17]],"date-time":"2021-10-17T00:00:00Z","timestamp":1634428800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2021,10,17]]},"DOI":"10.1145\/3474085.3475540","type":"proceedings-article","created":{"date-parts":[[2021,10,18]],"date-time":"2021-10-18T04:52:26Z","timestamp":1634532746000},"page":"4091-4099","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":53,"title":["Interventional Video Relation Detection"],"prefix":"10.1145","author":[{"given":"Yicong","family":"Li","sequence":"first","affiliation":[{"name":"National University of Singapore, Singapore, Singapore"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Xun","family":"Yang","sequence":"additional","affiliation":[{"name":"National University of Singapore, Singapore, Singapore"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Xindi","family":"Shang","sequence":"additional","affiliation":[{"name":"National University of Singapore, Singapore, Singapore"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Tat-Seng","family":"Chua","sequence":"additional","affiliation":[{"name":"National University of Singapore, Singapore, Singapore"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2021,10,17]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"VSRN: Visual-Semantic Relation Network for Video Visual Relation inference","author":"Cao Qianwen","year":"2021"},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1145\/2964284.2964315"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1145\/3123266.3123428"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"crossref","unstructured":"Tianshui Chen Weihao Yu Riquan Chen and Liang Lin. 2019. Knowledge embedded routing network for scene graph generation. In CVPR. 6163--6171.  Tianshui Chen Weihao Yu Riquan Chen and Liang Lin. 2019. Knowledge embedded routing network for scene graph generation. In CVPR. 6163--6171.","DOI":"10.1109\/CVPR.2019.00632"},{"key":"e_1_3_2_1_5_1","first-page":"1271","article-title":"Scalable Deep Hashing for Large- Scale Social Image Retrieval","volume":"29","author":"Cui H.","year":"2020","journal-title":"IEEE TIP"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"crossref","unstructured":"Bo Dai Yuqi Zhang and Dahua Lin. 2017. Detecting visual relationships with deep relational networks. In CVPR. 3076--3086.  Bo Dai Yuqi Zhang and Dahua Lin. 2017. Detecting visual relationships with deep relational networks. In CVPR. 3076--3086.","DOI":"10.1109\/CVPR.2017.352"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.5244\/C.28.65"},{"key":"e_1_3_2_1_8_1","volume-title":"Multiple Hypothesis Video Relation Detection. In 2019 IEEE Fifth International Conference on Multimedia Big Data. IEEE, 287--291","author":"Di Donglin","year":"2019"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2021.3059295"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"crossref","unstructured":"Christoph Feichtenhofer Axel Pinz and Andrew Zisserman. 2017. Detect to Track and Track to Detect. In ICCV.  Christoph Feichtenhofer Axel Pinz and Andrew Zisserman. 2017. Detect to Track and Track to Detect. In ICCV.","DOI":"10.1109\/ICCV.2017.330"},{"key":"e_1_3_2_1_11_1","first-page":"767","article-title":"A Pairwise Attentive Adversarial Spatiotemporal Network for Cross- Domain Few-Shot Action Recognition","volume":"30","author":"Gao Zan","year":"2021","journal-title":"IEEE TIP"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1145\/3377876"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"crossref","unstructured":"Georgia Gkioxari and Jitendra Malik. 2015. Finding action tubes. In CVPR. 759-- 768.  Georgia Gkioxari and Jitendra Malik. 2015. Finding action tubes. In CVPR. 759-- 768.","DOI":"10.1109\/CVPR.2015.7298676"},{"key":"e_1_3_2_1_14_1","unstructured":"Jiuxiang Gu Handong Zhao Zhe Lin Sheng Li Jianfei Cai and Mingyang Ling. 2019. Scene graph generation with external knowledge and image reconstruction. In CVPR. 1969--1978.  Jiuxiang Gu Handong Zhao Zhe Lin Sheng Li Jianfei Cai and Mingyang Ling. 2019. Scene graph generation with external knowledge and image reconstruction. In CVPR. 1969--1978."},{"key":"e_1_3_2_1_15_1","volume-title":"Prajit Ramachandran, Mohammad Babaeizadeh, Honghui Shi, Jianan Li, Shuicheng Yan, and Thomas S Huang.","author":"Han Wei","year":"2016"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3416276"},{"key":"e_1_3_2_1_17_1","first-page":"188","article-title":"Neighbourhood structure preserving cross-modal embedding for video hyperlinking","volume":"22","author":"Hao Yanbin","year":"2019","journal-title":"IEEE TMM"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413595"},{"key":"e_1_3_2_1_19_1","unstructured":"Kaiming He Xiangyu Zhang Shaoqing Ren and Jian Sun. 2016. Deep residual learning for image recognition. In CVPR. 770--778.  Kaiming He Xiangyu Zhang Shaoqing Ren and Jian Sun. 2016. Deep residual learning for image recognition. In CVPR. 770--778."},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1162\/neco.1997.9.8.1735"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"crossref","unstructured":"J. Johnson R. Krishna M. Stark L. Li D. A. Shamma M. S. Bernstein and L. Fei-Fei. 2015. Image retrieval using scene graphs. In CVPR. 3668--3678.  J. Johnson R. Krishna M. Stark L. Li D. A. Shamma M. S. Bernstein and L. Fei-Fei. 2015. Image retrieval using scene graphs. In CVPR. 3668--3678.","DOI":"10.1109\/CVPR.2015.7298990"},{"key":"e_1_3_2_1_22_1","unstructured":"Yikang Li Wanli Ouyang Bolei Zhou Kun Wang and Xiaogang Wang. 2017. Scene graph generation from objects phrases and region captions. In ICCV. 1261--1270.  Yikang Li Wanli Ouyang Bolei Zhou Kun Wang and Xiaogang Wang. 2017. Scene graph generation from objects phrases and region captions. In ICCV. 1261--1270."},{"key":"e_1_3_2_1_23_1","volume-title":"Vrr-vg: Refocusing visually-relevant relationships. In ICCV. 10403--10412.","author":"Liang Yuanzhi","year":"2019"},{"key":"e_1_3_2_1_24_1","unstructured":"Chenchen Liu Yang Jin Kehan Xu Guoqiang Gong and Yadong Mu. 2020. Beyond short-term snippet: Video relation detection with spatio-temporal global context. In CVPR. 10840--10849.  Chenchen Liu Yang Jin Kehan Xu Guoqiang Gong and Yadong Mu. 2020. Beyond short-term snippet: Video relation detection with spatio-temporal global context. In CVPR. 10840--10849."},{"key":"e_1_3_2_1_25_1","unstructured":"Mason Liu Menglong Zhu Marie White Yinxiao Li and Dmitry Kalenichenko. 2019. Looking Fast and Slow: Memory-Guided Mobile Video Object Detection. arXiv:1903.10172 [cs.CV]  Mason Liu Menglong Zhu Marie White Yinxiao Li and Dmitry Kalenichenko. 2019. Looking Fast and Slow: Memory-Guided Mobile Video Object Detection. arXiv:1903.10172 [cs.CV]"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v33i01.33018803"},{"key":"e_1_3_2_1_27_1","volume-title":"Dally","author":"Mao Huizi","year":"2019"},{"key":"e_1_3_2_1_28_1","volume-title":"Causal inference in statistics: A primer","author":"Pearl Judea"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.5555\/3238230"},{"key":"e_1_3_2_1_30_1","volume-title":"Modelling relations with prototypes for visual relation detection. Multimedia Tools and Applications","author":"Plesse Francois","year":"2020"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1145\/3343031.3351058"},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.5555\/2969239.2969250"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1145\/3323873.3325056"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1145\/3123266.3123380"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1145\/3343031.3356082"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413764"},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1145\/3343031.3356076"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"crossref","unstructured":"Kaihua Tang Yulei Niu Jianqiang Huang Jiaxin Shi and Hanwang Zhang. 2020. Unbiased scene graph generation from biased training. In CVPR. 3716--3725.  Kaihua Tang Yulei Niu Jianqiang Huang Jiaxin Shi and Hanwang Zhang. 2020. Unbiased scene graph generation from biased training. In CVPR. 3716--3725.","DOI":"10.1109\/CVPR42600.2020.00377"},{"key":"e_1_3_2_1_39_1","volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 10424--10433","author":"Hubert Tsai Yao-Hung","year":"2019"},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.5555\/3295222.3295349"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"crossref","unstructured":"Tan Wang Jianqiang Huang Hanwang Zhang and Qianru Sun. 2020. Visual commonsense r-cnn. In CVPR. 10760--10770.  Tan Wang Jianqiang Huang Hanwang Zhang and Qianru Sun. 2020. Visual commonsense r-cnn. In CVPR. 10760--10770.","DOI":"10.1109\/CVPR42600.2020.01077"},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2019.2923608"},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1145\/3343031.3351034"},{"key":"e_1_3_2_1_44_1","volume-title":"Visual relation grounding in videos","author":"Xiao Junbin"},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"crossref","unstructured":"Junbin Xiao Xindi Shang Angela Yao and Tat-Seng Chua. 2021. NExT-QA: Next Phase of Question-Answering to Explaining Temporal Actions. In CVPR.  Junbin Xiao Xindi Shang Angela Yao and Tat-Seng Chua. 2021. NExT-QA: Next Phase of Question-Answering to Explaining Temporal Actions. In CVPR.","DOI":"10.1109\/CVPR46437.2021.00965"},{"key":"e_1_3_2_1_46_1","unstructured":"Jifeng Dai Lu Yuan Yichen Wei Xizhou Zhu Yuwen Xiong. 2017. Deep Feature Flow for Video Recognition.  Jifeng Dai Lu Yuan Yichen Wei Xizhou Zhu Yuwen Xiong. 2017. Deep Feature Flow for Video Recognition."},{"key":"e_1_3_2_1_47_1","unstructured":"Danfei Xu Yuke Zhu Christopher B Choy and Li Fei-Fei. 2017. Scene graph generation by iterative message passing. In CVPR. 5410--5419.  Danfei Xu Yuke Zhu Christopher B Choy and Li Fei-Fei. 2017. Scene graph generation by iterative message passing. In CVPR. 5410--5419."},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"publisher","DOI":"10.5555\/3045118.3045336"},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"publisher","DOI":"10.1145\/3397271.3401151"},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"publisher","DOI":"10.1145\/3404835.3462823"},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"publisher","DOI":"10.1145\/3331184.3331242"},{"key":"e_1_3_2_1_52_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413610"},{"key":"e_1_3_2_1_53_1","unstructured":"Xu Yang Hanwang Zhang and Jianfei Cai. 2020. Deconfounded image captioning: A causal retrospect. (2020).  Xu Yang Hanwang Zhang and Jianfei Cai. 2020. Deconfounded image captioning: A causal retrospect. (2020)."},{"key":"e_1_3_2_1_54_1","first-page":"2987","article-title":"Person reidentification via structural deep metric learning","volume":"30","author":"Yang Xun","year":"2018","journal-title":"IEEE TNNLS"},{"key":"e_1_3_2_1_55_1","unstructured":"Zhongqi Yue Hanwang Zhang Qianru Sun and Xian-Sheng Hua. 2020. Interventional Few-Shot Learning. In NeurIPS.  Zhongqi Yue Hanwang Zhang Qianru Sun and Xian-Sheng Hua. 2020. Interventional Few-Shot Learning. In NeurIPS."},{"key":"e_1_3_2_1_56_1","doi-asserted-by":"crossref","unstructured":"Rowan Zellers Mark Yatskar Sam Thomson and Yejin Choi. 2018. Neural motifs: Scene graph parsing with global context. In CVPR. 5831--5840.  Rowan Zellers Mark Yatskar Sam Thomson and Yejin Choi. 2018. Neural motifs: Scene graph parsing with global context. In CVPR. 5831--5840.","DOI":"10.1109\/CVPR.2018.00611"},{"key":"e_1_3_2_1_57_1","doi-asserted-by":"crossref","unstructured":"C. Zhang and J. Kim. 2019. Modeling Long- and Short-Term Temporal Context for Video Object Detection. In ICIP. 71--75. https:\/\/doi.org\/10.1109\/ICIP.2019.8802920  C. Zhang and J. Kim. 2019. Modeling Long- and Short-Term Temporal Context for Video Object Detection. In ICIP. 71--75. https:\/\/doi.org\/10.1109\/ICIP.2019.8802920","DOI":"10.1109\/ICIP.2019.8802920"},{"key":"e_1_3_2_1_58_1","unstructured":"Dong Zhang Hanwang Zhang Jinhui Tang Xiansheng Hua and Qianru Sun. 2020. Causal intervention for weakly-supervised semantic segmentation. In NeurIPS.  Dong Zhang Hanwang Zhang Jinhui Tang Xiansheng Hua and Qianru Sun. 2020. Causal intervention for weakly-supervised semantic segmentation. In NeurIPS."},{"key":"e_1_3_2_1_59_1","doi-asserted-by":"publisher","DOI":"10.1145\/3343031.3356080"},{"key":"e_1_3_2_1_60_1","first-page":"4643","article-title":"Deep Collaborative Multi-View Hashing for Large-Scale Image Search","volume":"29","author":"Zhu Lei","year":"2020","journal-title":"IEEE TIP"},{"key":"e_1_3_2_1_61_1","unstructured":"Xizhou Zhu Yujie Wang Jifeng Dai Lu Yuan and Yichen Wei. 2017. Flow-guided feature aggregation for video object detection. In ICCV. 408--417.  Xizhou Zhu Yujie Wang Jifeng Dai Lu Yuan and Yichen Wei. 2017. Flow-guided feature aggregation for video object detection. In ICCV. 408--417."},{"key":"e_1_3_2_1_62_1","doi-asserted-by":"crossref","unstructured":"Bohan Zhuang Lingqiao Liu Chunhua Shen and Ian Reid. 2017. Towards context-aware interaction recognition for visual relationship detection. In ICCV. 589--598.  Bohan Zhuang Lingqiao Liu Chunhua Shen and Ian Reid. 2017. Towards context-aware interaction recognition for visual relationship detection. In ICCV. 589--598.","DOI":"10.1109\/ICCV.2017.71"}],"event":{"name":"MM '21: ACM Multimedia Conference","location":"Virtual Event China","acronym":"MM '21","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 29th ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3474085.3475540","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3474085.3475540","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,17]],"date-time":"2025-06-17T20:49:10Z","timestamp":1750193350000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3474085.3475540"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2021,10,17]]},"references-count":62,"alternative-id":["10.1145\/3474085.3475540","10.1145\/3474085"],"URL":"https:\/\/doi.org\/10.1145\/3474085.3475540","relation":{},"subject":[],"published":{"date-parts":[[2021,10,17]]},"assertion":[{"value":"2021-10-17","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}