{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,11]],"date-time":"2026-01-11T01:12:57Z","timestamp":1768093977012,"version":"3.49.0"},"publisher-location":"New York, NY, USA","reference-count":53,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,5,30]],"date-time":"2024-05-30T00:00:00Z","timestamp":1717027200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"Technology Innovation Committee of Shenzhen Municipality Foundation","award":["JCYJ20210324132203007"],"award-info":[{"award-number":["JCYJ20210324132203007"]}]},{"DOI":"10.13039\/501100006374","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62222203, 62072080, 62306065"],"award-info":[{"award-number":["62222203, 62072080, 62306065"]}],"id":[{"id":"10.13039\/501100006374","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Guangdong Basic and Applied Basic Research Foundation","award":["2023A1515140104"],"award-info":[{"award-number":["2023A1515140104"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,5,30]]},"DOI":"10.1145\/3652583.3658113","type":"proceedings-article","created":{"date-parts":[[2024,6,7]],"date-time":"2024-06-07T06:30:40Z","timestamp":1717741840000},"page":"618-627","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":1,"title":["PTAN: Principal Token-aware Adjacent Network for Compositional Temporal Grounding"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0001-6480-3457","authenticated-orcid":false,"given":"Zhuoyuan","family":"Wei","sequence":"first","affiliation":[{"name":"School of Computer Science and Engineering, University of Electronic Science and Technology of China, Chengdu, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2209-651X","authenticated-orcid":false,"given":"Xun","family":"Jiang","sequence":"additional","affiliation":[{"name":"School of Computer Science and Engineering, University of Electronic Science and Technology of China, Chengdu, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9318-0084","authenticated-orcid":false,"given":"Zheng","family":"Wang","sequence":"additional","affiliation":[{"name":"College of Electronic and Information Engineering, Tongji University, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7303-3231","authenticated-orcid":false,"given":"Fumin","family":"Shen","sequence":"additional","affiliation":[{"name":"School of Computer Science and Engineering, University of Electronic Science and Technology of China, Chengdu, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5685-3123","authenticated-orcid":false,"given":"Xing","family":"Xu","sequence":"additional","affiliation":[{"name":"School of Computer Science and Engineering, University of Electronic Science and Technology of China, Chengdu, China"}]}],"member":"320","published-online":{"date-parts":[[2024,6,7]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.563"},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.618"},{"key":"e_1_3_2_1_3_1","volume-title":"A closer look at temporal sentence grounding in videos: Datasets and metrics. arXiv preprint arXiv:2101.09028","author":"Yuan Yitian","year":"2021","unstructured":"Yitian Yuan, Xiaohan Lan, Long Chen, Wei Liu, Xin Wang, and Wenwu Zhu. A closer look at temporal sentence grounding in videos: Datasets and metrics. arXiv preprint arXiv:2101.09028, 2021."},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00304"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.02215"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i07.6984"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2021.3120745"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01082"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01511"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.02207"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01830"},{"key":"e_1_3_2_1_12_1","first-page":"4171","volume-title":"Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies","author":"Devlin Jacob","year":"2019","unstructured":"Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. BERT: pre-training of deep bidirectional transformers for language understanding. In Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, pages 4171--4186, 2019."},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v36i3.20163"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3612394"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1145\/3477495.3532078"},{"key":"e_1_3_2_1_16_1","article-title":"Semantic decoupling network for temporal language grounding","author":"Jiang Xun","year":"2022","unstructured":"Xun Jiang, Xing Xu, Jingran Zhang, Fumin Shen, Zuo Cao, and Heng Tao Shen. Sdn: Semantic decoupling network for temporal language grounding. IEEE Transactions on Neural Networks and Learning Systems, pages 1--15, 2022.","journal-title":"IEEE Transactions on Neural Networks and Learning Systems, pages 1--15"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2021.3063631"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v35i3.16285"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.acl-main.585"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01248"},{"key":"e_1_3_2_1_21_1","first-page":"8748","volume-title":"International conference on machine learning","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al. Learning transferable visual models from natural language supervision. In International conference on machine learning, pages 8748--8763. PMLR, 2021."},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01170"},{"key":"e_1_3_2_1_23_1","volume-title":"Measuring compositional generalization: A comprehensive method on realistic data","author":"Keysers Daniel","year":"2020","unstructured":"Daniel Keysers, Nathanael Sch\u00e4rli, Nathan Scales, Hylke Buisman, Daniel Furrer, Sergii Kashubin, Nikola Momchev, Danila Sinopalnikov, Lukasz Stafiniak, Tibor Tihon, Dmitry Tsarkov, Xiao Wang, Marc van Zee, and Olivier Bousquet. Measuring compositional generalization: A comprehensive method on realistic data, 2020."},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/K19-1009"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1016\/B978-044481714-3\/50011-4"},{"key":"e_1_3_2_1_26_1","volume-title":"Building machines that learn and think like people. Behavioral and brain sciences, 40:e253","author":"Lake Brenden M","year":"2017","unstructured":"Brenden M Lake, Tomer D Ullman, Joshua B Tenenbaum, and Samuel J Gershman. Building machines that learn and think like people. Behavioral and brain sciences, 40:e253, 2017."},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1162\/tacl_a_00361"},{"key":"e_1_3_2_1_28_1","volume-title":"International Conference on Learning Representations","author":"Hudson Drew A","year":"2018","unstructured":"Drew A Hudson and Christopher D Manning. Compositional attention networks for machine reasoning. In International Conference on Learning Representations, 2018."},{"key":"e_1_3_2_1_29_1","first-page":"3070","article-title":"Multimodal graph networks for compositional generalization in visual question answering","volume":"33","author":"Saqur Raeid","year":"2020","unstructured":"Raeid Saqur and Karthik Narasimhan. Multimodal graph networks for compositional generalization in visual question answering. Advances in Neural Information Processing Systems, 33:3070--3081, 2020.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v36i3.20243"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.emnlp-main.731"},{"key":"e_1_3_2_1_32_1","volume-title":"Compositional multi-object reinforcement learning with linear relation networks. arXiv preprint arXiv:2201.13388","author":"Mambelli Davide","year":"2022","unstructured":"Davide Mambelli, Frederik Tr\u00e4uble, Stefan Bauer, Bernhard Sch\u00f6lkopf, and Francesco Locatello. Compositional multi-object reinforcement learning with linear relation networks. arXiv preprint arXiv:2201.13388, 2022."},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2023.3239229"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v35i4.16452"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.findings-emnlp.157"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01113"},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.215"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00113"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00054"},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3548309"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00277"},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2024.3374111"},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3612244"},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i4.28101"},{"key":"e_1_3_2_1_45_1","article-title":"Multigrained attention network with mutual exclusion for composed query-based image retrieval","author":"Li Shenshen","year":"2023","unstructured":"Shenshen Li, Xing Xu, Xun Jiang, Fumin Shen, Xin Liu, and Heng Tao Shen. Multigrained attention network with mutual exclusion for composed query-based image retrieval. IEEE Transactions on Circuits and Systems for Video Technology, pages 1--1, 2023.","journal-title":"IEEE Transactions on Circuits and Systems for Video Technology, pages 1--1"},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3612349"},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.502"},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"publisher","DOI":"10.3115\/v1\/D14-1162"},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"publisher","DOI":"10.3115\/1118108.1118117"},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.83"},{"key":"e_1_3_2_1_51_1","volume-title":"Towards balanced alignment: Modal-enhanced semantic modeling for video moment retrieval. arXiv preprint arXiv:2312.12155","author":"Liu Zhihang","year":"2023","unstructured":"Zhihang Liu, Jun Li, Hongtao Xie, Pandeng Li, Jiannan Ge, Sun-Ao Liu, and Guoqing Jin. Towards balanced alignment: Modal-enhanced semantic modeling for video moment retrieval. arXiv preprint arXiv:2312.12155, 2023."},{"key":"e_1_3_2_1_52_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.510"},{"key":"e_1_3_2_1_53_1","volume-title":"International Conference on Learning Representations","author":"Diederik","year":"2015","unstructured":"Diederik P. Kingma and Jimmy Ba. Adam: A method for stochastic optimization. In International Conference on Learning Representations, 2015."}],"event":{"name":"ICMR '24: International Conference on Multimedia Retrieval","location":"Phuket Thailand","acronym":"ICMR '24","sponsor":["SIGMM ACM Special Interest Group on Multimedia","SIGSOFT ACM Special Interest Group on Software Engineering"]},"container-title":["Proceedings of the 2024 International Conference on Multimedia Retrieval"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3652583.3658113","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3652583.3658113","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,21]],"date-time":"2025-08-21T08:52:15Z","timestamp":1755766335000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3652583.3658113"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,5,30]]},"references-count":53,"alternative-id":["10.1145\/3652583.3658113","10.1145\/3652583"],"URL":"https:\/\/doi.org\/10.1145\/3652583.3658113","relation":{},"subject":[],"published":{"date-parts":[[2024,5,30]]},"assertion":[{"value":"2024-06-07","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}