{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,8,21]],"date-time":"2025-08-21T18:50:16Z","timestamp":1755802216189,"version":"3.44.0"},"publisher-location":"New York, NY, USA","reference-count":60,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,5,30]],"date-time":"2024-05-30T00:00:00Z","timestamp":1717027200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/"}],"funder":[{"DOI":"10.13039\/501100006374","name":"Science Foundation Ireland","doi-asserted-by":"publisher","award":["18\/CRT\/6223"],"award-info":[{"award-number":["18\/CRT\/6223"]}],"id":[{"id":"10.13039\/501100006374","id-type":"DOI","asserted-by":"publisher"}]},{"name":"ADAPT, the SFI Research Centre for AI-Driven Digital Content Technology","award":["13\/RC\/2106_P2"],"award-info":[{"award-number":["13\/RC\/2106_P2"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,5,30]]},"DOI":"10.1145\/3652583.3658096","type":"proceedings-article","created":{"date-parts":[[2024,6,7]],"date-time":"2024-06-07T06:30:40Z","timestamp":1717741840000},"page":"460-468","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["A Parallel Transformer Framework for Video Moment Retrieval"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-1356-9434","authenticated-orcid":false,"given":"Thao-Nhu","family":"Nguyen","sequence":"first","affiliation":[{"name":"Dublin City University, Dublin, Ireland"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3300-1806","authenticated-orcid":false,"given":"Zongyao","family":"Li","sequence":"additional","affiliation":[{"name":"NEC Corporation, Tokyo, Japan"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4673-6924","authenticated-orcid":false,"given":"Yamazaki","family":"Satoshi","sequence":"additional","affiliation":[{"name":"NEC Corporation, Tokyo, JP"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4303-9020","authenticated-orcid":false,"given":"Jianquan","family":"Liu","sequence":"additional","affiliation":[{"name":"NEC Corporation, Tokyo, JP"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2903-3968","authenticated-orcid":false,"given":"Cathal","family":"Gurrin","sequence":"additional","affiliation":[{"name":"Dublin City University, Dublin, Ireland"}]}],"member":"320","published-online":{"date-parts":[[2024,6,7]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Is Space-Time Attention All You Need for Video Understanding? CoRR","author":"Bertasius Gedas","year":"2021","unstructured":"Gedas Bertasius, Heng Wang, and Lorenzo Torresani. 2021. Is Space-Time Attention All You Need for Video Understanding? CoRR, Vol. abs\/2102.05095 (2021). showeprint[arXiv]2102.05095 https:\/\/arxiv.org\/abs\/2102.05095"},{"key":"e_1_3_2_1_2_1","volume-title":"Davis","author":"Bodla Navaneeth","year":"2017","unstructured":"Navaneeth Bodla, Bharat Singh, Rama Chellappa, and Larry S. Davis. 2017. Soft-NMS - Improving Object Detection With One Line of Code. (2017)."},{"key":"e_1_3_2_1_3_1","volume-title":"End-to-End Object Detection with Transformers. CoRR","author":"Carion Nicolas","year":"2020","unstructured":"Nicolas Carion, Francisco Massa, Gabriel Synnaeve, Nicolas Usunier, Alexander Kirillov, and Sergey Zagoruyko. 2020. End-to-End Object Detection with Transformers. CoRR, Vol. abs\/2005.12872 (2020). [arXiv]2005.12872 https:\/\/arxiv.org\/abs\/2005.12872"},{"key":"e_1_3_2_1_4_1","volume-title":"Action Recognition? A New Model and the Kinetics Dataset. CoRR","author":"Carreira Jo","year":"2017","unstructured":"Jo a o Carreira and Andrew Zisserman. 2017a. Quo Vadis, Action Recognition? A New Model and the Kinetics Dataset. CoRR, Vol. abs\/1705.07750 (2017). [arXiv]1705.07750 http:\/\/arxiv.org\/abs\/1705.07750"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.502"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D18-1015"},{"key":"e_1_3_2_1_7_1","volume-title":"Fine-grained Video-Text Retrieval with Hierarchical Graph Reasoning. CoRR","author":"Chen Shizhe","year":"2020","unstructured":"Shizhe Chen, Yida Zhao, Qin Jin, and Qi Wu. 2020. Fine-grained Video-Text Retrieval with Hierarchical Graph Reasoning. CoRR, Vol. abs\/2003.00392 (2020). [arXiv]2003.00392 https:\/\/arxiv.org\/abs\/2003.00392"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2023.3268066"},{"key":"e_1_3_2_1_9_1","volume-title":"Kunz","author":"Dhingra Naina","year":"2021","unstructured":"Naina Dhingra, Florian Ritter, and Andreas M. Kunz. 2021. BGT-Net: Bidirectional GRU Transformer Network for Scene Graph Generation. CoRR, Vol. abs\/2109.05346 (2021). [arXiv]2109.05346 https:\/\/arxiv.org\/abs\/2109.05346"},{"key":"e_1_3_2_1_10_1","unstructured":"Alexey Dosovitskiy Lucas Beyer Alexander Kolesnikov Dirk Weissenborn Xiaohua Zhai Thomas Unterthiner Mostafa Dehghani Matthias Minderer Georg Heigold Sylvain Gelly et al. 2020. An image is worth 16x16 words: Transformers for image recognition at scale. arXiv preprint arXiv:2010.11929 (2020)."},{"key":"e_1_3_2_1_11_1","volume-title":"A Generalization of Transformer Networks to Graphs. CoRR","author":"Dwivedi Vijay Prakash","year":"2020","unstructured":"Vijay Prakash Dwivedi and Xavier Bresson. 2020. A Generalization of Transformer Networks to Graphs. CoRR, Vol. abs\/2012.09699 (2020). showeprint[arXiv]2012.09699 https:\/\/arxiv.org\/abs\/2012.09699"},{"key":"e_1_3_2_1_12_1","volume-title":"Vasileios Argyriou, Dorothy Monekosso, and Paolo Remagnino.","author":"Fajtl Jiri","year":"2018","unstructured":"Jiri Fajtl, Hajar Sadeghi Sokeh, Vasileios Argyriou, Dorothy Monekosso, and Paolo Remagnino. 2018. Summarizing Videos with Attention. CoRR, Vol. abs\/1812.01969 (2018). showeprint[arXiv]1812.01969 http:\/\/arxiv.org\/abs\/1812.01969"},{"key":"e_1_3_2_1_13_1","volume-title":"X3D: Expanding Architectures for Efficient Video Recognition. CoRR","author":"Feichtenhofer Christoph","year":"2020","unstructured":"Christoph Feichtenhofer. 2020. X3D: Expanding Architectures for Efficient Video Recognition. CoRR, Vol. abs\/2004.04730 (2020). [arXiv]2004.04730 https:\/\/arxiv.org\/abs\/2004.04730"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58548-8_13"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.563"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00155"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2017.2729019"},{"key":"e_1_3_2_1_18_1","volume-title":"Unpaired Image Captioning via Scene Graph Alignments. CoRR","author":"Gu Jiuxiang","year":"2019","unstructured":"Jiuxiang Gu, Shafiq R. Joty, Jianfei Cai, Handong Zhao, Xu Yang, and Gang Wang. 2019. Unpaired Image Captioning via Scene Graph Alignments. CoRR, Vol. abs\/1903.10658 (2019). [arXiv]1903.10658 http:\/\/arxiv.org\/abs\/1903.10658"},{"key":"e_1_3_2_1_19_1","volume-title":"Action Genome: Actions as Composition of Spatio-temporal Scene Graphs. CoRR","author":"Ji Jingwei","year":"2019","unstructured":"Jingwei Ji, Ranjay Krishna, Li Fei-Fei, and Juan Carlos Niebles. 2019. Action Genome: Actions as Composition of Spatio-temporal Scene Graphs. CoRR, Vol. abs\/1912.06992 (2019). [arXiv]1912.06992 http:\/\/arxiv.org\/abs\/1912.06992"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298990"},{"key":"e_1_3_2_1_21_1","volume-title":"QVHighlights: Detecting Moments and Highlights in Videos via Natural Language Queries. CoRR","author":"Lei Jie","year":"2021","unstructured":"Jie Lei, Tamara L. Berg, and Mohit Bansal. 2021a. QVHighlights: Detecting Moments and Highlights in Videos via Natural Language Queries. CoRR, Vol. abs\/2107.09609 (2021). [arXiv]2107.09609 https:\/\/arxiv.org\/abs\/2107.09609"},{"key":"e_1_3_2_1_22_1","volume-title":"Less is More: ClipBERT for Video-and-Language Learning via Sparse Sampling. CoRR","author":"Lei Jie","year":"2021","unstructured":"Jie Lei, Linjie Li, Luowei Zhou, Zhe Gan, Tamara L. Berg, Mohit Bansal, and Jingjing Liu. 2021b. Less is More: ClipBERT for Video-and-Language Learning via Sparse Sampling. CoRR, Vol. abs\/2102.06183 (2021). showeprint[arXiv]2102.06183 https:\/\/arxiv.org\/abs\/2102.06183"},{"key":"e_1_3_2_1_23_1","volume-title":"Scene Graph Generation from Objects, Phrases and Caption Regions. CoRR","author":"Li Yikang","year":"2017","unstructured":"Yikang Li, Wanli Ouyang, Bolei Zhou, Kun Wang, and Xiaogang Wang. 2017. Scene Graph Generation from Objects, Phrases and Caption Regions. CoRR, Vol. abs\/1707.09700 (2017). [arXiv]1707.09700 http:\/\/arxiv.org\/abs\/1707.09700"},{"key":"e_1_3_2_1_24_1","volume-title":"Belongie","author":"Lin Tsung-Yi","year":"2016","unstructured":"Tsung-Yi Lin, Piotr Doll\u00e1r, Ross B. Girshick, Kaiming He, Bharath Hariharan, and Serge J. Belongie. 2016. Feature Pyramid Networks for Object Detection. CoRR, Vol. abs\/1612.03144 (2016). [arXiv]1612.03144 http:\/\/arxiv.org\/abs\/1612.03144"},{"key":"e_1_3_2_1_25_1","volume-title":"Fixing Weight Decay Regularization in Adam. CoRR","author":"Loshchilov Ilya","year":"2017","unstructured":"Ilya Loshchilov and Frank Hutter. 2017. Fixing Weight Decay Regularization in Adam. CoRR, Vol. abs\/1711.05101 (2017). showeprint[arXiv]1711.05101 http:\/\/arxiv.org\/abs\/1711.05101"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","unstructured":"Chujie Lu Long Chen Chilie Tan Xiaolin Li and Jun Xiao. 2019. DEBUG: A Dense Bottom-Up Grounding Approach for Natural Language Video Localization. In Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing (EMNLP-IJCNLP) Kentaro Inui Jing Jiang Vincent Ng and Xiaojun Wan (Eds.). Association for Computational Linguistics Hong Kong China 5144--5153. https:\/\/doi.org\/10.18653\/v1\/D19-1518","DOI":"10.18653\/v1\/D19-1518"},{"key":"e_1_3_2_1_27_1","volume-title":"Visual Relationship Detection with Language Priors. CoRR","author":"Lu Cewu","year":"2016","unstructured":"Cewu Lu, Ranjay Krishna, Michael S. Bernstein, and Li Fei-Fei. 2016. Visual Relationship Detection with Language Priors. CoRR, Vol. abs\/1608.00187 (2016). [arXiv]1608.00187 http:\/\/arxiv.org\/abs\/1608.00187"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.patcog.2014.08.002"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.02205"},{"key":"e_1_3_2_1_30_1","volume-title":"Local-Global Video-Text Interactions for Temporal Grounding. CoRR","author":"Mun Jonghwan","year":"2020","unstructured":"Jonghwan Mun, Minsu Cho, and Bohyung Han. 2020. Local-Global Video-Text Interactions for Temporal Grounding. CoRR, Vol. abs\/2004.07514 (2020). [arXiv]2004.07514 https:\/\/arxiv.org\/abs\/2004.07514"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01088"},{"key":"e_1_3_2_1_32_1","volume-title":"PyTorch: An Imperative Style","author":"Paszke Adam","year":"2019","unstructured":"Adam Paszke, Sam Gross, Francisco Massa, Adam Lerer, James Bradbury, Gregory Chanan, Trevor Killeen, Zeming Lin, Natalia Gimelshein, Luca Antiga, Alban Desmaison, Andreas K\u00f6pf, Edward Z. Yang, Zach DeVito, Martin Raison, Alykhan Tejani, Sasank Chilamkurthy, Benoit Steiner, Lu Fang, Junjie Bai, and Soumith Chintala. 2019. PyTorch: An Imperative Style, High-Performance Deep Learning Library. CoRR, Vol. abs\/1912.01703 (2019). [arXiv]1912.01703 http:\/\/arxiv.org\/abs\/1912.01703"},{"key":"e_1_3_2_1_33_1","volume-title":"VLG-Net: Video-Language Graph Matching Network for Video Grounding. CoRR","author":"Qu Sally Sisi","year":"2020","unstructured":"Sally Sisi Qu, Mattia Soldan, Mengmeng Xu, Jesper Tegn\u00e9r, and Bernard Ghanem. 2020. VLG-Net: Video-Language Graph Matching Network for Video Grounding. CoRR, Vol. abs\/2011.10132 (2020). [arXiv]2011.10132 https:\/\/arxiv.org\/abs\/2011.10132"},{"key":"e_1_3_2_1_34_1","volume-title":"Chris Hallacy, A. Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, and Ilya Sutskever.","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, A. Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, and Ilya Sutskever. 2021. Learning Transferable Visual Models From Natural Language Supervision. In ICML."},{"key":"e_1_3_2_1_35_1","volume-title":"Faster R-CNN: Towards Real-Time Object Detection with Region Proposal Networks. CoRR","author":"Ren Shaoqing","year":"2015","unstructured":"Shaoqing Ren, Kaiming He, Ross B. Girshick, and Jian Sun. 2015. Faster R-CNN: Towards Real-Time Object Detection with Region Proposal Networks. CoRR, Vol. abs\/1506.01497 (2015). [arXiv]1506.01497 http:\/\/arxiv.org\/abs\/1506.01497"},{"key":"e_1_3_2_1_36_1","volume-title":"Hollywood in Homes: Crowdsourcing Data Collection for Activity Understanding. CoRR","author":"Sigurdsson Gunnar A.","year":"2016","unstructured":"Gunnar A. Sigurdsson, G\u00fcl Varol, Xiaolong Wang, Ali Farhadi, Ivan Laptev, and Abhinav Gupta. 2016. Hollywood in Homes: Crowdsourcing Data Collection for Activity Understanding. CoRR, Vol. abs\/1604.01753 (2016). showeprint[arXiv]1604.01753 http:\/\/arxiv.org\/abs\/1604.01753"},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1109\/TITS.2021.3110713"},{"key":"e_1_3_2_1_38_1","volume-title":"Attention is all you need. Advances in neural information processing systems","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, \u0141ukasz Kaiser, and Illia Polosukhin. 2017. Attention is all you need. Advances in neural information processing systems, Vol. 30 (2017)."},{"key":"e_1_3_2_1_39_1","volume-title":"Temporal Segment Networks: Towards Good Practices for Deep Action Recognition. CoRR","author":"Wang Limin","year":"2016","unstructured":"Limin Wang, Yuanjun Xiong, Zhe Wang, Yu Qiao, Dahua Lin, Xiaoou Tang, and Luc Van Gool. 2016. Temporal Segment Networks: Towards Good Practices for Deep Action Recognition. CoRR, Vol. abs\/1608.00859 (2016). [arXiv]1608.00859 http:\/\/arxiv.org\/abs\/1608.00859"},{"key":"e_1_3_2_1_40_1","volume-title":"Cross-modal Scene Graph Matching for Relationship-aware Image-Text Retrieval. CoRR","author":"Wang Sijin","year":"2019","unstructured":"Sijin Wang, Ruiping Wang, Ziwei Yao, Shiguang Shan, and Xilin Chen. 2019. Cross-modal Scene Graph Matching for Relationship-aware Image-Text Retrieval. CoRR, Vol. abs\/1910.05134 (2019). [arXiv]1910.05134 http:\/\/arxiv.org\/abs\/1910.05134"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01139"},{"key":"e_1_3_2_1_42_1","volume-title":"Boundary Proposal Network for Two-Stage Natural Language Video Localization. CoRR","author":"Xiao Shaoning","year":"2021","unstructured":"Shaoning Xiao, Long Chen, Songyang Zhang, Wei Ji, Jian Shao, Lu Ye, and Jun Xiao. 2021. Boundary Proposal Network for Two-Stage Natural Language Video Localization. CoRR, Vol. abs\/2103.08109 (2021). [arXiv]2103.08109 https:\/\/arxiv.org\/abs\/2103.08109"},{"key":"e_1_3_2_1_43_1","unstructured":"Yicheng Xiao Zhuoyan Luo Yong Liu Yue Ma Hengwei Bian Yatai Ji Yujiu Yang and Xiu Li. 2023. Bridging the Gap: A Unified Video Comprehension Framework for Moment Retrieval and Highlight Detection. arxiv: 2311.16464 [cs.CV]"},{"key":"e_1_3_2_1_44_1","volume-title":"Scene Graph Generation by Iterative Message Passing. CoRR","author":"Xu Danfei","year":"2017","unstructured":"Danfei Xu, Yuke Zhu, Christopher B. Choy, and Li Fei-Fei. 2017. Scene Graph Generation by Iterative Message Passing. CoRR, Vol. abs\/1701.02426 (2017). [arXiv]1701.02426 http:\/\/arxiv.org\/abs\/1701.02426"},{"key":"e_1_3_2_1_45_1","volume-title":"MH-DETR: Video Moment and Highlight Detection with Cross-modal Transformer. arXiv preprint arXiv:2305.00355","author":"Xu Yifang","year":"2023","unstructured":"Yifang Xu, Yunzhuo Sun, Yang Li, Yilei Shi, Xiaoxiang Zhu, and Sidan Du. 2023. MH-DETR: Video Moment and Highlight Detection with Cross-modal Transformer. arXiv preprint arXiv:2305.00355 (2023)."},{"key":"e_1_3_2_1_46_1","volume-title":"Multiview Transformers for Video Recognition. CoRR","author":"Yan Shen","year":"2022","unstructured":"Shen Yan, Xuehan Xiong, Anurag Arnab, Zhichao Lu, Mi Zhang, Chen Sun, and Cordelia Schmid. 2022. Multiview Transformers for Video Recognition. CoRR, Vol. abs\/2201.04288 (2022). [arXiv]2201.04288 https:\/\/arxiv.org\/abs\/2201.04288"},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01253"},{"key":"e_1_3_2_1_48_1","volume-title":"Graph R-CNN for Scene Graph Generation. CoRR","author":"Yang Jianwei","year":"2018","unstructured":"Jianwei Yang, Jiasen Lu, Stefan Lee, Dhruv Batra, and Devi Parikh. 2018. Graph R-CNN for Scene Graph Generation. CoRR, Vol. abs\/1808.00191 (2018). showeprint[arXiv]1808.00191 http:\/\/arxiv.org\/abs\/1808.00191"},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.01094"},{"key":"e_1_3_2_1_50_1","volume-title":"Exploring Visual Relationship for Image Captioning. CoRR","author":"Yao Ting","year":"2018","unstructured":"Ting Yao, Yingwei Pan, Yehao Li, and Tao Mei. 2018. Exploring Visual Relationship for Image Captioning. CoRR, Vol. abs\/1809.07041 (2018). [arXiv]1809.07041 http:\/\/arxiv.org\/abs\/1809.07041"},{"key":"e_1_3_2_1_51_1","volume-title":"Neural Motifs: Scene Graph Parsing with Global Context. CoRR","author":"Zellers Rowan","year":"2017","unstructured":"Rowan Zellers, Mark Yatskar, Sam Thomson, and Yejin Choi. 2017. Neural Motifs: Scene Graph Parsing with Global Context. CoRR, Vol. abs\/1711.06640 (2017). [arXiv]1711.06640 http:\/\/arxiv.org\/abs\/1711.06640"},{"key":"e_1_3_2_1_52_1","volume-title":"Dense Regression Network for Video Grounding. CoRR","author":"Zeng Runhao","year":"2020","unstructured":"Runhao Zeng, Haoming Xu, Wenbing Huang, Peihao Chen, Mingkui Tan, and Chuang Gan. 2020. Dense Regression Network for Video Grounding. CoRR, Vol. abs\/2004.03545 (2020). [arXiv]2004.03545 https:\/\/arxiv.org\/abs\/2004.03545"},{"key":"e_1_3_2_1_53_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00225"},{"key":"e_1_3_2_1_54_1","volume-title":"ActionFormer: Localizing Moments of Actions with Transformers. In European Conference on Computer Vision (LNCS","volume":"510","author":"Zhang Chen-Lin","year":"2022","unstructured":"Chen-Lin Zhang, Jianxin Wu, and Yin Li. 2022. ActionFormer: Localizing Moments of Actions with Transformers. In European Conference on Computer Vision (LNCS, Vol. 13664). 492--510."},{"key":"e_1_3_2_1_55_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.acl-main.585"},{"key":"e_1_3_2_1_56_1","volume-title":"Graphical Contrastive Losses for Scene Graph Generation. CoRR","author":"Zhang Ji","year":"2019","unstructured":"Ji Zhang, Kevin J. Shih, Ahmed Elgammal, Andrew Tao, and Bryan Catanzaro. 2019c. Graphical Contrastive Losses for Scene Graph Generation. CoRR, Vol. abs\/1903.02728 (2019). [arXiv]1903.02728 http:\/\/arxiv.org\/abs\/1903.02728"},{"key":"e_1_3_2_1_57_1","volume-title":"Learning 2D Temporal Adjacent Networks for Moment Localization with Natural Language. CoRR","author":"Zhang Songyang","year":"2019","unstructured":"Songyang Zhang, Houwen Peng, Jianlong Fu, and Jiebo Luo. 2019b. Learning 2D Temporal Adjacent Networks for Moment Localization with Natural Language. CoRR, Vol. abs\/1912.03590 (2019). [arXiv]1912.03590 http:\/\/arxiv.org\/abs\/1912.03590"},{"key":"e_1_3_2_1_58_1","volume-title":"Multi-Modal Interaction Graph Convolutional Network for Temporal Language Localization in Videos. CoRR","author":"Zhang Zongmeng","year":"2021","unstructured":"Zongmeng Zhang, Xianjing Han, Xuemeng Song, Yan Yan, and Liqiang Nie. 2021. Multi-Modal Interaction Graph Convolutional Network for Temporal Language Localization in Videos. CoRR, Vol. abs\/2110.06058 (2021). [arXiv]2110.06058 https:\/\/arxiv.org\/abs\/2110.06058"},{"key":"e_1_3_2_1_59_1","doi-asserted-by":"publisher","DOI":"10.1145\/3331184.3331235"},{"key":"e_1_3_2_1_60_1","volume-title":"Deep Reinforcement Learning for Unsupervised Video Summarization with Diversity-Representativeness Reward. CoRR","author":"Zhou Kaiyang","year":"2018","unstructured":"Kaiyang Zhou and Yu Qiao. 2018. Deep Reinforcement Learning for Unsupervised Video Summarization with Diversity-Representativeness Reward. CoRR (2018). http:\/\/arxiv.org\/abs\/1801.00054"}],"event":{"name":"ICMR '24: International Conference on Multimedia Retrieval","sponsor":["SIGMM ACM Special Interest Group on Multimedia","SIGSOFT ACM Special Interest Group on Software Engineering"],"location":"Phuket Thailand","acronym":"ICMR '24"},"container-title":["Proceedings of the 2024 International Conference on Multimedia Retrieval"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3652583.3658096","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3652583.3658096","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,21]],"date-time":"2025-08-21T08:48:00Z","timestamp":1755766080000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3652583.3658096"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,5,30]]},"references-count":60,"alternative-id":["10.1145\/3652583.3658096","10.1145\/3652583"],"URL":"https:\/\/doi.org\/10.1145\/3652583.3658096","relation":{},"subject":[],"published":{"date-parts":[[2024,5,30]]},"assertion":[{"value":"2024-06-07","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}