{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,10,28]],"date-time":"2025-10-28T05:57:10Z","timestamp":1761631030357,"version":"3.41.0"},"publisher-location":"New York, NY, USA","reference-count":45,"publisher":"ACM","license":[{"start":{"date-parts":[[2023,6,12]],"date-time":"2023-06-12T00:00:00Z","timestamp":1686528000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2023,6,12]]},"DOI":"10.1145\/3591106.3592267","type":"proceedings-article","created":{"date-parts":[[2023,6,8]],"date-time":"2023-06-08T22:33:38Z","timestamp":1686263618000},"page":"39-47","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":5,"title":["Multi-Label Meta Weighting for Long-Tailed Dynamic Scene Graph Generation"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0005-6092-8104","authenticated-orcid":false,"given":"Shuo","family":"Chen","sequence":"first","affiliation":[{"name":"VIS Lab, University of Amsterdam, Netherlands"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7537-6457","authenticated-orcid":false,"given":"Yingjun","family":"Du","sequence":"additional","affiliation":[{"name":"VIS Lab, University of Amsterdam, Netherlands"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9275-5942","authenticated-orcid":false,"given":"Pascal","family":"Mettes","sequence":"additional","affiliation":[{"name":"VIS Lab, P.S.M.Mettes@uva.nl, Netherlands"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9092-1556","authenticated-orcid":false,"given":"Cees G.M.","family":"Snoek","sequence":"additional","affiliation":[{"name":"VIS Lab, University of Amsterdam, Netherlands"}]}],"member":"320","published-online":{"date-parts":[[2023,6,12]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"VQA: Visual Question Answering","author":"Antol Stanislaw","year":"2015","unstructured":"Stanislaw Antol, Aishwarya Agrawal, Jiasen Lu, Margaret Mitchell, Dhruv Batra, C.\u00a0Lawrence Zitnick, and Devi Parikh. 2015. VQA: Visual Question Answering. In ICCV. IEEE Computer Society, Santiago, Chile, 2425\u20132433."},{"volume-title":"EvoGrad: Efficient Gradient-Based Meta-Learning and Hyperparameter Optimization","author":"Bohdal Ondrej","unstructured":"Ondrej Bohdal, Yongxin Yang, and Timothy\u00a0M. Hospedales. 2021. EvoGrad: Efficient Gradient-Based Meta-Learning and Hyperparameter Optimization. In NeurIPS. Neural Information Processing Systems Foundation, Virtual, 22234\u201322246.","key":"e_1_3_2_1_2_1"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_3_1","DOI":"10.1016\/j.neucom.2020.12.029"},{"doi-asserted-by":"crossref","unstructured":"Shuo Chen Pascal Mettes Tao Hu and Cees\u00a0GM Snoek. 2020. Interactivity Proposals for Surveillance Videos. In ICMR. ACM Dublin Ireland 108\u2013116.","key":"e_1_3_2_1_4_1","DOI":"10.1145\/3372278.3390680"},{"volume-title":"Diagnosing Errors in Video Relation Detectors","author":"Chen Shuo","unstructured":"Shuo Chen, Pascal Mettes, and Cees\u00a0GM Snoek. 2021. Diagnosing Errors in Video Relation Detectors. In BMVC. BMVA Press, Online, 241.","key":"e_1_3_2_1_5_1"},{"key":"e_1_3_2_1_6_1","volume-title":"Social Fabric: Tubelet Compositions for Video Relation Detection","author":"Chen Shuo","year":"2021","unstructured":"Shuo Chen, Zenglin Shi, Pascal Mettes, and Cees\u00a0GM Snoek. 2021. Social Fabric: Tubelet Compositions for Video Relation Detection. In ICCV. IEEE, Montreal, QC, Canada, 13465\u201313474."},{"volume-title":"Spatial-temporal transformer for dynamic scene graph generation","author":"Cong Yuren","unstructured":"Yuren Cong, Wentong Liao, Hanno Ackermann, Bodo Rosenhahn, and Michael\u00a0Ying Yang. 2021. Spatial-temporal transformer for dynamic scene graph generation. In ICCV. IEEE, Montreal, QC, Canada, 16352\u201316362.","key":"e_1_3_2_1_7_1"},{"volume-title":"Learning of visual relations: The devil is in the tails","author":"Desai Alakh","unstructured":"Alakh Desai, Tz-Ying Wu, Subarna Tripathi, and Nuno Vasconcelos. 2021. Learning of visual relations: The devil is in the tails. In ICCV. IEEE, Montreal, QC, Canada, 15384\u201315393.","key":"e_1_3_2_1_8_1"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_9_1","DOI":"10.1109\/CVPR52688.2022.01882"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_10_1","DOI":"10.1109\/CVPR52688.2022.01889"},{"volume-title":"Deep residual learning for image recognition","author":"He Kaiming","unstructured":"Kaiming He, Xiangyu Zhang, Shaoqing Ren, and Jian Sun. 2016. Deep residual learning for image recognition. In CVPR. IEEE Computer Society, Las Vegas, NV, USA, 770\u2013778.","key":"e_1_3_2_1_11_1"},{"volume-title":"Action genome: Actions as compositions of spatio-temporal scene graphs","author":"Ji Jingwei","unstructured":"Jingwei Ji, Ranjay Krishna, Li Fei-Fei, and Juan\u00a0Carlos Niebles. 2020. Action genome: Actions as compositions of spatio-temporal scene graphs. In CVPR. Computer Vision Foundation \/ IEEE, Seattle, WA, USA, 10233\u201310244.","key":"e_1_3_2_1_12_1"},{"volume-title":"Image retrieval using scene graphs","author":"Johnson Justin","unstructured":"Justin Johnson, Ranjay Krishna, Michael Stark, Li-Jia Li, David Shamma, Michael Bernstein, and Li Fei-Fei. 2015. Image retrieval using scene graphs. In CVPR. IEEE Computer Society, Boston, MA, USA, 3668\u20133678.","key":"e_1_3_2_1_13_1"},{"volume-title":"Learning Interactions and Relationships Between Movie Characters","author":"Kukleva Anna","unstructured":"Anna Kukleva, Makarand Tapaswi, and Ivan Laptev. 2020. Learning Interactions and Relationships Between Movie Characters. In CVPR. Computer Vision Foundation \/ IEEE, Seattle, WA, USA, 9846\u20139855.","key":"e_1_3_2_1_14_1"},{"volume-title":"Bipartite graph network with adaptive message passing for unbiased scene graph generation","author":"Li Rongjie","unstructured":"Rongjie Li, Songyang Zhang, Bo Wan, and Xuming He. 2021. Bipartite graph network with adaptive message passing for unbiased scene graph generation. In CVPR. Computer Vision Foundation \/ IEEE, Virtual, 11109\u201311119.","key":"e_1_3_2_1_15_1"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_16_1","DOI":"10.1109\/CVPR52688.2022.01884"},{"volume-title":"Dynamic Scene Graph Generation via Anticipatory Pre-Training","author":"Li Yiming","unstructured":"Yiming Li, Xiaoshan Yang, and Changsheng Xu. 2022. Dynamic Scene Graph Generation via Anticipatory Pre-Training. In CVPR. IEEE, New Orleans, LA, USA, 13864\u201313873.","key":"e_1_3_2_1_17_1"},{"volume-title":"Focal Loss for Dense Object Detection","author":"Lin Tsung-Yi","unstructured":"Tsung-Yi Lin, Priya Goyal, Ross\u00a0B. Girshick, Kaiming He, and Piotr Doll\u00e1r. 2017. Focal Loss for Dense Object Detection. In ICCV. IEEE Computer Society, Venice, Italy, 2999\u20133007.","key":"e_1_3_2_1_18_1"},{"volume-title":"Microsoft COCO: Common objects in context","author":"Lin Tsung-Yi","unstructured":"Tsung-Yi Lin, Michael Maire, Serge Belongie, James Hays, Pietro Perona, Deva Ramanan, Piotr Doll\u00e1r, and C\u00a0Lawrence Zitnick. 2014. Microsoft COCO: Common objects in context. In ECCV. Springer, Zurich, Switzerland, 740\u2013755.","key":"e_1_3_2_1_19_1"},{"volume-title":"Beyond Short-Term Snippet: Video Relation Detection with Spatio-Temporal Global Context","author":"Liu Chenchen","unstructured":"Chenchen Liu, Yang Jin, Kehan Xu, Guoqiang Gong, and Yadong Mu. 2020. Beyond Short-Term Snippet: Video Relation Detection with Spatio-Temporal Global Context. In CVPR. Computer Vision Foundation \/ IEEE, Seattle, WA, USA, 10837\u201310846.","key":"e_1_3_2_1_20_1"},{"volume-title":"Fully convolutional scene graph generation","author":"Liu Hengyue","unstructured":"Hengyue Liu, Ning Yan, Masood Mortazavi, and Bir Bhanu. 2021. Fully convolutional scene graph generation. In CVPR. Computer Vision Foundation \/ IEEE, Virtual, 11546\u201311556.","key":"e_1_3_2_1_21_1"},{"volume-title":"ICLR. OpenReview.net","author":"Loshchilov Ilya","unstructured":"Ilya Loshchilov and Frank Hutter. 2019. Decoupled weight decay regularization. In ICLR. OpenReview.net, New Orleans, LA, USA.","key":"e_1_3_2_1_22_1"},{"volume-title":"ACM MM","author":"Qian Xufeng","unstructured":"Xufeng Qian, Yueting Zhuang, Yimeng Li, Shaoning Xiao, Shiliang Pu, and Jun Xiao. 2019. Video relation detection with spatio-temporal graph. In ACM MM. ACM, Nice, France, 84\u201393.","key":"e_1_3_2_1_23_1"},{"volume-title":"ICML. PMLR","author":"Ren Mengye","unstructured":"Mengye Ren, Wenyuan Zeng, Bin Yang, and Raquel Urtasun. 2018. Learning to reweight examples for robust deep learning. In ICML. PMLR, Stockholmsm\u00e4ssan, Stockholm, Sweden, 4331\u20134340.","key":"e_1_3_2_1_24_1"},{"volume-title":"Faster R-CNN: Towards Real-Time Object Detection with Region Proposal Networks","author":"Ren Shaoqing","unstructured":"Shaoqing Ren, Kaiming He, Ross Girshick, and Jian Sun. 2015. Faster R-CNN: Towards Real-Time Object Detection with Region Proposal Networks. In NeurIPS. Neural Information Processing Systems Foundation, Montreal, Quebec, Canada, 91\u201399.","key":"e_1_3_2_1_25_1"},{"volume-title":"ICMR. ACM","author":"Shang Xindi","unstructured":"Xindi Shang, Donglin Di, Junbin Xiao, Yu Cao, Xun Yang, and Tat-Seng Chua. 2019. Annotating objects and relations in user-generated videos. In ICMR. ACM, Ottawa, ON, Canada, 279\u2013287.","key":"e_1_3_2_1_26_1"},{"volume-title":"ACM MM","author":"Shang Xindi","unstructured":"Xindi Shang, Tongwei Ren, Jingfan Guo, Hanwang Zhang, and Tat-Seng Chua. 2017. Video visual relation detection. In ACM MM. ACM, Mountain View, CA, USA, 1300\u20131308.","key":"e_1_3_2_1_27_1"},{"key":"e_1_3_2_1_28_1","volume-title":"BC","author":"Shu Jun","year":"2019","unstructured":"Jun Shu, Qi Xie, Lixuan Yi, Qian Zhao, Sanping Zhou, Zongben Xu, and Deyu Meng. 2019. Meta-weight-net: Learning an explicit mapping for sample weighting. In NeurIPS. Neural Information Processing Systems Foundation, Vancouver, BC, Canada, 1917\u20131928."},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_29_1","DOI":"10.1561\/1500000014"},{"doi-asserted-by":"crossref","unstructured":"Zixuan Su Xindi Shang Jingjing Chen Yu-Gang Jiang Zhiyong Qiu and Tat-Seng Chua. 2020. Video Relation Detection via Multiple Hypothesis Association. In ACM MM. ACM Virtual Event \/ Seattle WA USA 3127\u20133135.","key":"e_1_3_2_1_30_1","DOI":"10.1145\/3394171.3413764"},{"volume-title":"ACM MM","author":"Sun Xu","unstructured":"Xu Sun, Tongwei Ren, Yuan Zi, and Gangshan Wu. 2019. Video visual relation detection via multi-modal feature fusion. In ACM MM. ACM, Nice, France, 2657\u20132661.","key":"e_1_3_2_1_31_1"},{"key":"e_1_3_2_1_32_1","volume-title":"LIGHTEN: Learning Interactions with Graph and Hierarchical TEmporal Networks for HOI in Videos. In ACM MM. ACM, Virtual Event \/ Seattle, WA, USA, 691\u2013699.","author":"Praneeth\u00a0Reddy Sunkesula Sai","year":"2020","unstructured":"Sai Praneeth\u00a0Reddy Sunkesula, Rishabh Dabral, and Ganesh Ramakrishnan. 2020. LIGHTEN: Learning Interactions with Graph and Hierarchical TEmporal Networks for HOI in Videos. In ACM MM. ACM, Virtual Event \/ Seattle, WA, USA, 691\u2013699."},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_33_1","DOI":"10.1109\/TIP.2022.3181511"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_34_1","DOI":"10.1145\/2812802"},{"volume-title":"Striking the Right Balance: Recall Loss for Semantic Segmentation","author":"Tian Junjiao","unstructured":"Junjiao Tian, Niluthpol\u00a0Chowdhury Mithun, Zachary Seymour, Han-Pang Chiu, and Zsolt Kira. 2022. Striking the Right Balance: Recall Loss for Semantic Segmentation. In ICRA. IEEE, Philadelphia, PA, USA, 5063\u20135069.","key":"e_1_3_2_1_35_1"},{"volume-title":"Simple online and realtime tracking with a deep association metric","author":"Wojke Nicolai","unstructured":"Nicolai Wojke, Alex Bewley, and Dietrich Paulus. 2017. Simple online and realtime tracking with a deep association metric. In ICIP. IEEE, Beijing, China, 3645\u20133649.","key":"e_1_3_2_1_36_1"},{"volume-title":"Distribution-balanced loss for multi-label classification in long-tailed datasets","author":"Wu Tong","unstructured":"Tong Wu, Qingqiu Huang, Ziwei Liu, Yu Wang, and Dahua Lin. 2020. Distribution-balanced loss for multi-label classification in long-tailed datasets. In ECCV. Springer, Glasgow, UK, 162\u2013178.","key":"e_1_3_2_1_37_1"},{"doi-asserted-by":"crossref","unstructured":"Wentao Xie Guanghui Ren and Si Liu. 2020. Video Relation Detection with Trajectory-aware Multi-modal Features. In ACM MM. ACM Virtual Event \/ Seattle WA USA 4590\u20134594.","key":"e_1_3_2_1_38_1","DOI":"10.1145\/3394171.3416284"},{"volume-title":"Scene graph generation by iterative message passing","author":"Xu Danfei","unstructured":"Danfei Xu, Yuke Zhu, Christopher\u00a0B Choy, and Li Fei-Fei. 2017. Scene graph generation by iterative message passing. In CVPR. IEEE Computer Society, Honolulu, HI, USA, 3097\u20133106.","key":"e_1_3_2_1_39_1"},{"key":"e_1_3_2_1_40_1","volume-title":"Lille","author":"Xu Kelvin","year":"2015","unstructured":"Kelvin Xu, Jimmy Ba, Ryan Kiros, Kyunghyun Cho, Aaron Courville, Ruslan Salakhutdinov, Richard Zemel, and Yoshua Bengio. 2015. Show, Attend and Tell: Neural Image Caption Generation with Visual Attention. In ICML. JMLR.org, Lille, France, 2048\u20132057."},{"key":"e_1_3_2_1_41_1","volume-title":"PCPL: Predicate-Correlation Perception Learning for Unbiased Scene Graph Generation. In ACMMM. ACM, Virtual Event \/ Seattle, WA, USA, 265\u2013273.","author":"Yan Shaotian","year":"2020","unstructured":"Shaotian Yan, Chen Shen, Zhongming Jin, Jianqiang Huang, Rongxin Jiang, Yaowu Chen, and Xian-Sheng Hua. 2020. PCPL: Predicate-Correlation Perception Learning for Unbiased Scene Graph Generation. In ACMMM. ACM, Virtual Event \/ Seattle, WA, USA, 265\u2013273."},{"volume-title":"Graph r-cnn for scene graph generation","author":"Yang Jianwei","unstructured":"Jianwei Yang, Jiasen Lu, Stefan Lee, Dhruv Batra, and Devi Parikh. 2018. Graph r-cnn for scene graph generation. In ECCV. Springer, Munich, Germany, 690\u2013706.","key":"e_1_3_2_1_42_1"},{"volume-title":"Neural motifs: Scene graph parsing with global context","author":"Zellers Rowan","unstructured":"Rowan Zellers, Mark Yatskar, Sam Thomson, and Yejin Choi. 2018. Neural motifs: Scene graph parsing with global context. In CVPR. Computer Vision Foundation \/ IEEE Computer Society, Salt Lake City, UT, USA, 5831\u20135840.","key":"e_1_3_2_1_43_1"},{"unstructured":"Yifan Zhang Bingyi Kang Bryan Hooi Shuicheng Yan and Jiashi Feng. 2021. Deep long-tailed learning: A survey. arxiv:2110.04596","key":"e_1_3_2_1_44_1"},{"volume-title":"ACM MM","author":"Zheng Sipeng","unstructured":"Sipeng Zheng, Xiangyu Chen, Shizhe Chen, and Qin Jin. 2019. Relation understanding in videos. In ACM MM. ACM, Nice, France, 2662\u20132666.","key":"e_1_3_2_1_45_1"}],"event":{"sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"acronym":"ICMR '23","name":"ICMR '23: International Conference on Multimedia Retrieval","location":"Thessaloniki Greece"},"container-title":["Proceedings of the 2023 ACM International Conference on Multimedia Retrieval"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3591106.3592267","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3591106.3592267","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,17]],"date-time":"2025-06-17T16:37:30Z","timestamp":1750178250000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3591106.3592267"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,6,12]]},"references-count":45,"alternative-id":["10.1145\/3591106.3592267","10.1145\/3591106"],"URL":"https:\/\/doi.org\/10.1145\/3591106.3592267","relation":{},"subject":[],"published":{"date-parts":[[2023,6,12]]},"assertion":[{"value":"2023-06-12","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}