{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,11,15]],"date-time":"2025-11-15T10:33:37Z","timestamp":1763202817837,"version":"3.44.0"},"publisher-location":"New York, NY, USA","reference-count":50,"publisher":"ACM","license":[{"start":{"date-parts":[[2023,10,26]],"date-time":"2023-10-26T00:00:00Z","timestamp":1698278400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"National Key Research & Development Project of China","award":["2021ZD0110700"],"award-info":[{"award-number":["2021ZD0110700"]}]},{"name":"the Fundamental Research Funds for the Central Universities","award":["226-2023-00048"],"award-info":[{"award-number":["226-2023-00048"]}]},{"name":"the National Natural Science Foundation of China","award":["U19B2043 61976185"],"award-info":[{"award-number":["U19B2043 61976185"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2023,10,26]]},"DOI":"10.1145\/3581783.3612024","type":"proceedings-article","created":{"date-parts":[[2023,10,27]],"date-time":"2023-10-27T07:27:30Z","timestamp":1698391650000},"page":"5153-5163","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":7,"title":["Triple Correlations-Guided Label Supplementation for Unbiased Video Scene Graph Generation"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0001-3911-2910","authenticated-orcid":false,"given":"Wenqing","family":"Wang","sequence":"first","affiliation":[{"name":"Zhejiang University, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2450-6551","authenticated-orcid":false,"given":"Kaifeng","family":"Gao","sequence":"additional","affiliation":[{"name":"Zhejiang University, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7037-1806","authenticated-orcid":false,"given":"Yawei","family":"Luo","sequence":"additional","affiliation":[{"name":"Zhejiang University, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-5359-4488","authenticated-orcid":false,"given":"Tao","family":"Jiang","sequence":"additional","affiliation":[{"name":"Zhejiang University, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4678-1936","authenticated-orcid":false,"given":"Fei","family":"Gao","sequence":"additional","affiliation":[{"name":"Zhejiang University of Technology, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7842-7616","authenticated-orcid":false,"given":"Jian","family":"Shao","sequence":"additional","affiliation":[{"name":"Zhejiang University, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0951-1072","authenticated-orcid":false,"given":"Jianwen","family":"Sun","sequence":"additional","affiliation":[{"name":"Central China Normal University, Wuhan, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6142-9914","authenticated-orcid":false,"given":"Jun","family":"Xiao","sequence":"additional","affiliation":[{"name":"Zhejiang University, Hangzhou, China"}]}],"member":"320","published-online":{"date-parts":[[2023,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Vqa: Visual question answering. In ICCV. 2425--2433.","author":"Antol Stanislaw","year":"2015","unstructured":"Stanislaw Antol, Aishwarya Agrawal, Jiasen Lu, Margaret Mitchell, Dhruv Batra, C Lawrence Zitnick, and Devi Parikh. 2015. Vqa: Visual question answering. In ICCV. 2425--2433."},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.neucom.2020.12.029"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"crossref","unstructured":"Joao Carreira and Andrew Zisserman. 2017. Quo vadis action recognition? a new model and the kinetics dataset. In CVPR. 6299--6308.","DOI":"10.1109\/CVPR.2017.502"},{"key":"e_1_3_2_1_4_1","volume-title":"Snoek","author":"Chen Shuo","year":"2021","unstructured":"Shuo Chen, Zenglin Shi, Pascal Mettes, and Cees G. M. Snoek. 2021. Social Fabric: Tubelet Compositions for Video Relation Detection. In ICCV."},{"key":"e_1_3_2_1_5_1","unstructured":"Siqi Chen Jun Xiao and Long Chen. 2023 b. Video scene graph generation from single-frame weak supervision. In ICLR."},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"crossref","unstructured":"Tianshui Chen Weihao Yu Riquan Chen and Liang Lin. 2019. Knowledge-embedded routing network for scene graph generation. In CVPR. 6163--6171.","DOI":"10.1109\/CVPR.2019.00632"},{"key":"e_1_3_2_1_7_1","volume-title":"2023 a. Dark Knowledge Balance Learning for Unbiased Scene Graph Generation. arXiv","author":"Chen Zhiqing","year":"2023","unstructured":"Zhiqing Chen, Yawei Luo, Jian Shao, Yi Yang, Chunping Wang, Lei Chen, and Jun Xiao. 2023 a. Dark Knowledge Balance Learning for Unbiased Scene Graph Generation. arXiv (2023)."},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"crossref","unstructured":"Yuren Cong Wentong Liao Hanno Ackermann Bodo Rosenhahn and Michael Ying Yang. 2021. Spatial-temporal transformer for dynamic scene graph generation. In ICCV. 16372--16382.","DOI":"10.1109\/ICCV48922.2021.01606"},{"key":"e_1_3_2_1_9_1","volume-title":"Dual encoding for video retrieval by text. TPAMI","author":"Dong Jianfeng","year":"2021","unstructured":"Jianfeng Dong, Xirong Li, Chaoxi Xu, Xun Yang, Gang Yang, Xun Wang, and Meng Wang. 2021. Dual encoding for video retrieval by text. TPAMI (2021)."},{"key":"e_1_3_2_1_10_1","volume-title":"Exploiting Long-Term Dependencies for Generating Dynamic Scene Graphs. arXiv","author":"Feng Shengyu","year":"2021","unstructured":"Shengyu Feng, Subarna Tripathi, Hesham Mostafa, Marcel Nassar, and Somdeb Majumdar. 2021. Exploiting Long-Term Dependencies for Generating Dynamic Scene Graphs. arXiv (2021)."},{"key":"e_1_3_2_1_11_1","volume-title":"Prajit Ramachandran, Mohammad Babaeizadeh, Honghui Shi, Jianan Li, Shuicheng Yan, and Thomas S Huang.","author":"Han Wei","year":"2016","unstructured":"Wei Han, Pooya Khorrami, Tom Le Paine, Prajit Ramachandran, Mohammad Babaeizadeh, Honghui Shi, Jianan Li, Shuicheng Yan, and Thomas S Huang. 2016. Seq-nms for video object detection. arXiv (2016)."},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"crossref","unstructured":"Kaiming He Xiangyu Zhang Shaoqing Ren and Jian Sun. 2016. Deep residual learning for image recognition. In CVPR. 770--778.","DOI":"10.1109\/CVPR.2016.90"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"crossref","unstructured":"Jingwei Ji Ranjay Krishna Li Fei-Fei and Juan Carlos Niebles. 2020. Action genome: Actions as compositions of spatio-temporal scene graphs. In CVPR. 10236--10247.","DOI":"10.1109\/CVPR42600.2020.01025"},{"key":"e_1_3_2_1_14_1","volume-title":"Adam: A method for stochastic optimization. arXiv","author":"Kingma Diederik P","year":"2014","unstructured":"Diederik P Kingma and Jimmy Ba. 2014. Adam: A method for stochastic optimization. arXiv (2014)."},{"key":"e_1_3_2_1_15_1","volume-title":"Label Semantic Knowledge Distillation for Unbiased Scene Graph Generation. arXiv","author":"Li Lin","year":"2022","unstructured":"Lin Li, Long Chen, Hanrong Shi, Wenxiao Wang, Jian Shao, Yi Yang, and Jun Xiao. 2022. Label Semantic Knowledge Distillation for Unbiased Scene Graph Generation. arXiv (2022)."},{"key":"e_1_3_2_1_16_1","unstructured":"Rongjie Li Songyang Zhang Bo Wan and Xuming He. 2021b. Bipartite Graph Network with Adaptive Message Passing for Unbiased Scene Graph Generation. In CVPR."},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2023.3289753"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"crossref","unstructured":"Yicong Li Xun Yang Xindi Shang and Tat-Seng Chua. 2021a. Interventional video relation detection. In ACM MM. 4091--4099.","DOI":"10.1145\/3474085.3475540"},{"key":"e_1_3_2_1_19_1","volume-title":"Ppdm: Parallel point detection and matching for real-time human-object interaction detection. In CVPR. 482--490.","author":"Liao Yue","year":"2020","unstructured":"Yue Liao, Si Liu, Fei Wang, Yanjie Chen, Chen Qian, and Jiashi Feng. 2020. Ppdm: Parallel point detection and matching for real-time human-object interaction detection. In CVPR. 482--490."},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/TKDE.2023.3284032"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2022.3186740"},{"key":"e_1_3_2_1_22_1","unstructured":"Chenchen Liu Yang Jin Kehan Xu Guoqiang Gong and Yadong Mu. 2020. Beyond short-term snippet: Video relation detection with spatio-temporal global context. In CVPR. 10840--10849."},{"key":"e_1_3_2_1_23_1","unstructured":"Hengyue Liu Ning Yan Masood S Mortazavi and Bir Bhanu. 2021. Fully Convolutional Scene Graph Generation. In CVPR."},{"key":"e_1_3_2_1_24_1","volume-title":"Adversarial style mining for one-shot unsupervised domain adaptation. Advances in neural information processing systems","author":"Luo Yawei","year":"2020","unstructured":"Yawei Luo, Ping Liu, Tao Guan, Junqing Yu, and Yi Yang. 2020. Adversarial style mining for one-shot unsupervised domain adaptation. Advances in neural information processing systems, Vol. 33 (2020), 20612--20623."},{"key":"e_1_3_2_1_25_1","first-page":"3940","article-title":"Category-level adversarial adaptation for semantic segmentation using purified features","volume":"44","author":"Luo Yawei","year":"2021","unstructured":"Yawei Luo, Ping Liu, Liang Zheng, Tao Guan, Junqing Yu, and Yi Yang. 2021. Category-level adversarial adaptation for semantic segmentation using purified features. IEEE Transactions on Pattern Analysis and Machine Intelligence, Vol. 44, 8 (2021), 3940--3956.","journal-title":"IEEE Transactions on Pattern Analysis and Machine Intelligence"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00261"},{"key":"e_1_3_2_1_27_1","volume-title":"Heng Tao Shen, and Jingkuan Song","author":"Lyu Xinyu","year":"2022","unstructured":"Xinyu Lyu, Lianli Gao, Yuyu Guo, Zhou Zhao, Hao Huang, Heng Tao Shen, and Jingkuan Song. 2022. Fine-grained predicates learning for scene graph generation. In CVPR. 19467--19475."},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"crossref","unstructured":"Ishan Misra C Lawrence Zitnick Margaret Mitchell and Ross Girshick. 2016. Seeing through the human reporting bias: Visual classifiers from noisy human-centric labels. In CVPR. 2930--2939.","DOI":"10.1109\/CVPR.2016.320"},{"key":"e_1_3_2_1_29_1","unstructured":"Alejandro Newell and Jia Deng. 2017. Pixels to Graphs by Associative Embedding. In NIPS."},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2023.3276505"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"crossref","unstructured":"Xufeng Qian Yueting Zhuang Yimeng Li Shaoning Xiao Shiliang Pu and Jun Xiao. 2019. Video relation detection with spatio-temporal graph. In ACM MM. 84--93.","DOI":"10.1145\/3343031.3351058"},{"key":"e_1_3_2_1_32_1","unstructured":"Shaoqing Ren Kaiming He Ross B Girshick and Jian Sun. 2015. Faster R-CNN: Towards Real-Time Object Detection with Region Proposal Networks. In NeurIPS."},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"crossref","unstructured":"Xindi Shang Donglin Di Junbin Xiao Yu Cao Xun Yang and Tat-Seng Chua. 2019. Annotating objects and relations in user-generated videos. In ICMR. 279--287.","DOI":"10.1145\/3323873.3325056"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"crossref","unstructured":"Xindi Shang Yicong Li Junbin Xiao Wei Ji and Tat-Seng Chua. 2021. Video Visual Relation Detection via Iterative Inference. In ACM MM. 3654--3663.","DOI":"10.1145\/3474085.3475263"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"crossref","unstructured":"Xindi Shang Tongwei Ren Jingfan Guo Hanwang Zhang and Tat-Seng Chua. 2017. Video visual relation detection. In ACM MM. 1300--1308.","DOI":"10.1145\/3123266.3123380"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1561\/1500000014"},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"crossref","unstructured":"Zixuan Su Xindi Shang Jingjing Chen Yu-Gang Jiang Zhiyong Qiu and Tat-Seng Chua. 2020. Video Relation Detection via Multiple Hypothesis Association. In ACM MM. 3127--3135.","DOI":"10.1145\/3394171.3413764"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"crossref","unstructured":"Xu Sun Tongwei Ren Yuan Zi and Gangshan Wu. 2019. Video visual relation detection via multi-modal feature fusion. In ACM MM. 2657--2661.","DOI":"10.1145\/3343031.3356076"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"crossref","unstructured":"Christian Szegedy Sergey Ioffe Vincent Vanhoucke and Alexander A Alemi. 2017. Inception-v4 inception-resnet and the impact of residual connections on learning. In AAAI.","DOI":"10.1609\/aaai.v31i1.11231"},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"crossref","unstructured":"Kaihua Tang Yulei Niu Jianqiang Huang Jiaxin Shi and Hanwang Zhang. 2020. Unbiased scene graph generation from biased training. In CVPR. 3716--3725.","DOI":"10.1109\/CVPR42600.2020.00377"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"crossref","unstructured":"Kaihua Tang Hanwang Zhang Baoyuan Wu Wenhan Luo and Wei Liu. 2019. Learning to compose dynamic tree structures for visual contexts. In CVPR. 6619--6628.","DOI":"10.1109\/CVPR.2019.00678"},{"key":"e_1_3_2_1_42_1","volume-title":"Movieqa: Understanding stories in movies through question-answering. In CVPR. 4631--4640.","author":"Tapaswi Makarand","year":"2016","unstructured":"Makarand Tapaswi, Yukun Zhu, Rainer Stiefelhagen, Antonio Torralba, Raquel Urtasun, and Sanja Fidler. 2016. Movieqa: Understanding stories in movies through question-answering. In CVPR. 4631--4640."},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"crossref","unstructured":"Yao Teng Limin Wang Zhifeng Li and Gangshan Wu. 2021. Target Adaptive Context Aggregation for Video Scene Graph Generation. In ICCV. 13688--13697.","DOI":"10.1109\/ICCV48922.2021.01343"},{"key":"e_1_3_2_1_44_1","unstructured":"Yao-Hung Hubert Tsai Santosh Divvala Louis-Philippe Morency Ruslan Salakhutdinov and Ali Farhadi. 2019. Video relationship reasoning using gated spatio-temporal energy graph. In CVPR. 10424--10433."},{"key":"e_1_3_2_1_45_1","volume-title":"What and When to Look?: Temporal Span Proposal Network for Video Visual Relation Detection. arXiv","author":"Woo Sangmin","year":"2021","unstructured":"Sangmin Woo, Junhyug Noh, and Kangil Kim. 2021. What and When to Look?: Temporal Span Proposal Network for Video Visual Relation Detection. arXiv (2021)."},{"key":"e_1_3_2_1_46_1","unstructured":"Danfei Xu Yuke Zhu Christopher B Choy and Li Fei-Fei. 2017. Scene graph generation by iterative message passing. In CVPR. 5410--5419."},{"volume-title":"Meta spatio-temporal debiasing for video scene graph generation","author":"Xu Li","key":"e_1_3_2_1_47_1","unstructured":"Li Xu, Haoxuan Qu, Jason Kuen, Jiuxiang Gu, and Jun Liu. 2022. Meta spatio-temporal debiasing for video scene graph generation. In ECCV. Springer, 374--390."},{"key":"e_1_3_2_1_48_1","volume-title":"Pcpl: Predicate-correlation perception learning for unbiased scene graph generation. In ACM MM. 265--273.","author":"Yan Shaotian","year":"2020","unstructured":"Shaotian Yan, Chen Shen, Zhongming Jin, Jianqiang Huang, Rongxin Jiang, Yaowu Chen, and Xian-Sheng Hua. 2020. Pcpl: Predicate-correlation perception learning for unbiased scene graph generation. In ACM MM. 265--273."},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"crossref","unstructured":"Rowan Zellers Mark Yatskar Sam Thomson and Yejin Choi. 2018. Neural motifs: Scene graph parsing with global context. In CVPR. 5831--5840.","DOI":"10.1109\/CVPR.2018.00611"},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"crossref","unstructured":"Sipeng Zheng Xiangyu Chen Shizhe Chen and Qin Jin. 2019. Relation understanding in videos. In ACM MM. 2662--2666.","DOI":"10.1145\/3343031.3356080"}],"event":{"name":"MM '23: The 31st ACM International Conference on Multimedia","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Ottawa ON Canada","acronym":"MM '23"},"container-title":["Proceedings of the 31st ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3612024","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3581783.3612024","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T00:03:01Z","timestamp":1755820981000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3612024"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,10,26]]},"references-count":50,"alternative-id":["10.1145\/3581783.3612024","10.1145\/3581783"],"URL":"https:\/\/doi.org\/10.1145\/3581783.3612024","relation":{},"subject":[],"published":{"date-parts":[[2023,10,26]]},"assertion":[{"value":"2023-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}