{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T06:10:03Z","timestamp":1755843003932,"version":"3.44.0"},"publisher-location":"New York, NY, USA","reference-count":36,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,7,10]],"date-time":"2024-07-10T00:00:00Z","timestamp":1720569600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"Collaborative Innovation Center of Novel Software Technology and Industrialization"},{"DOI":"10.13039\/501100006374","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["61972192,62172208,61906085"],"award-info":[{"award-number":["61972192,62172208,61906085"]}],"id":[{"id":"10.13039\/501100006374","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,7,10]]},"DOI":"10.1145\/3626772.3657795","type":"proceedings-article","created":{"date-parts":[[2024,7,11]],"date-time":"2024-07-11T12:40:05Z","timestamp":1720701605000},"page":"2167-2176","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":1,"title":["Short Video Ordering via Position Decoding and Successor Prediction"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-9198-5324","authenticated-orcid":false,"given":"Shiping","family":"Ge","sequence":"first","affiliation":[{"name":"State Key Laboratory for Novel Software Technology, Nanjing University, Nanjing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-3871-5350","authenticated-orcid":false,"given":"Qiang","family":"Chen","sequence":"additional","affiliation":[{"name":"Tencent WeChat, Guangzhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5243-4992","authenticated-orcid":false,"given":"Zhiwei","family":"Jiang","sequence":"additional","affiliation":[{"name":"State Key Laboratory for Novel Software Technology, Nanjing University, Nanjing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9497-6244","authenticated-orcid":false,"given":"Yafeng","family":"Yin","sequence":"additional","affiliation":[{"name":"State Key Laboratory for Novel Software Technology, Nanjing University, Nanjing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-0014-5406","authenticated-orcid":false,"given":"Ziyao","family":"Chen","sequence":"additional","affiliation":[{"name":"Tencent WeChat, Guangzhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1112-790X","authenticated-orcid":false,"given":"Qing","family":"Gu","sequence":"additional","affiliation":[{"name":"State Key Laboratory for Novel Software Technology, Nanjing University, Nanjing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2024,7,11]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1037\/a0017088"},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00293"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1145\/1273496.1273513"},{"key":"e_1_3_2_1_4_1","first-page":"2","article-title":"Deep Learning for Video Captioning: A Review","volume":"1","author":"Chen Shaoxiang","year":"2019","unstructured":"Shaoxiang Chen, Ting Yao, and Yu-Gang Jiang. 2019. Deep Learning for Video Captioning: A Review.. In IJCAI, Vol. 1. 2.","journal-title":"IJCAI"},{"key":"e_1_3_2_1_5_1","volume-title":"Neural sentence ordering. arXiv preprint arXiv:1607.06952","author":"Chen Xinchi","year":"2016","unstructured":"Xinchi Chen, Xipeng Qiu, and Xuanjing Huang. 2016. Neural sentence ordering. arXiv preprint arXiv:1607.06952 (2016)."},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D18-1465"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.emnlp-main.511"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2017.2729019"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1145\/3543507.3583258"},{"key":"e_1_3_2_1_10_1","volume-title":"End-to-end neural sentence ordering using pointer network. arXiv preprint arXiv:1611.04953","author":"Gong Jingjing","year":"2016","unstructured":"Jingjing Gong, Xinchi Chen, Xipeng Qiu, and Xuanjing Huang. 2016. End-to-end neural sentence ordering using pointer network. arXiv preprint arXiv:1611.04953 (2016)."},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3548436"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00999"},{"key":"e_1_3_2_1_13_1","volume-title":"Tvqa: Spatio-temporal grounding for video question answering. arXiv preprint arXiv:1904.11574","author":"Lei Jie","year":"2019","unstructured":"Jie Lei, Licheng Yu, Tamara L Berg, and Mohit Bansal. 2019. Tvqa: Spatio-temporal grounding for video question answering. arXiv preprint arXiv:1904.11574 (2019)."},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00490"},{"key":"e_1_3_2_1_15_1","volume-title":"Evaluating text coherence at sentence and paragraph levels. arXiv preprint arXiv:2006.03221","author":"Liu Sennan","year":"2020","unstructured":"Sennan Liu, Shuang Zeng, and Sujian Li. 2020. Evaluating text coherence at sentence and paragraph levels. arXiv preprint arXiv:2006.03221 (2020)."},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v32i1.11997"},{"key":"e_1_3_2_1_17_1","volume-title":"Sgdr: Stochastic gradient descent with warm restarts. arXiv preprint arXiv:1608.03983","author":"Loshchilov Ilya","year":"2016","unstructured":"Ilya Loshchilov and Frank Hutter. 2016. Sgdr: Stochastic gradient descent with warm restarts. arXiv preprint arXiv:1608.03983 (2016)."},{"key":"e_1_3_2_1_18_1","volume-title":"Decoupled weight decay regularization. arXiv preprint arXiv:1711.05101","author":"Loshchilov Ilya","year":"2017","unstructured":"Ilya Loshchilov and Frank Hutter. 2017. Decoupled weight decay regularization. arXiv preprint arXiv:1711.05101 (2017)."},{"key":"e_1_3_2_1_19_1","volume-title":"Univl: A unified video and language pre-training model for multimodal understanding and generation. arXiv preprint arXiv:2002.06353","author":"Luo Huaishao","year":"2020","unstructured":"Huaishao Luo, Lei Ji, Botian Shi, Haoyang Huang, Nan Duan, Tianrui Li, Jason Li, Taroon Bharti, and Ming Zhou. 2020. Univl: A unified video and language pre-training model for multimodal understanding and generation. arXiv preprint arXiv:2002.06353 (2020)."},{"key":"e_1_3_2_1_20_1","volume-title":"Context is key: New approaches to neural coherence modeling. arXiv preprint arXiv:1812.04722","author":"McClure David","year":"2018","unstructured":"David McClure, Shayne O'Brien, and Deb Roy. 2018. Context is key: New approaches to neural coherence modeling. arXiv preprint arXiv:1812.04722 (2018)."},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00637"},{"key":"e_1_3_2_1_22_1","volume-title":"Topological sort for sentence ordering. arXiv preprint arXiv:2005.00432","author":"Prabhumoye Shrimai","year":"2020","unstructured":"Shrimai Prabhumoye, Ruslan Salakhutdinov, and Alan W Black. 2020. Topological sort for sentence ordering. arXiv preprint arXiv:2005.00432 (2020)."},{"key":"e_1_3_2_1_23_1","volume-title":"International conference on machine learning. PMLR, 8748--8763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al. 2021. Learning transferable visual models from natural language supervision. In International conference on machine learning. PMLR, 8748--8763."},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.sigdial-1.16"},{"key":"e_1_3_2_1_25_1","volume-title":"Multimodal emotion recognition in response to videos","author":"Soleymani Mohammad","year":"2011","unstructured":"Mohammad Soleymani, Maja Pantic, and Thierry Pun. 2011. Multimodal emotion recognition in response to videos. IEEE transactions on affective computing, Vol. 3, 2 (2011), 211--223."},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00756"},{"key":"e_1_3_2_1_27_1","volume-title":"Video question answering: a survey of models and datasets. Mobile Networks and Applications","author":"Sun Guanglu","year":"2021","unstructured":"Guanglu Sun, Lili Liang, Tianlin Li, Bo Yu, Meng Wu, and Bolun Zhang. 2021. Video question answering: a survey of models and datasets. Mobile Networks and Applications (2021), 1--34."},{"key":"e_1_3_2_1_28_1","article-title":"Visualizing data using t-SNE","volume":"9","author":"der Maaten Laurens Van","year":"2008","unstructured":"Laurens Van der Maaten and Geoffrey Hinton. 2008. Visualizing data using t-SNE. Journal of machine learning research, Vol. 9, 11 (2008).","journal-title":"Journal of machine learning research"},{"key":"e_1_3_2_1_29_1","volume-title":"Attention is all you need. Advances in neural information processing systems","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, \u0141ukasz Kaiser, and Illia Polosukhin. 2017. Attention is all you need. Advances in neural information processing systems, Vol. 30 (2017)."},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.515"},{"key":"e_1_3_2_1_31_1","volume-title":"Pointer networks. Advances in neural information processing systems","author":"Vinyals Oriol","year":"2015","unstructured":"Oriol Vinyals, Meire Fortunato, and Navdeep Jaitly. 2015. Pointer networks. Advances in neural information processing systems, Vol. 28 (2015)."},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00795"},{"key":"e_1_3_2_1_33_1","volume-title":"Controlling Class Layout for Deep Ordinal Classification via Constrained Proxies Learning. arXiv preprint arXiv:2303.00396","author":"Wang Cong","year":"2023","unstructured":"Cong Wang, Zhiwei Jiang, Yafeng Yin, Zifeng Cheng, Shiping Ge, and Qing Gu. 2023. Controlling Class Layout for Deep Ordinal Classification via Constrained Proxies Learning. arXiv preprint arXiv:2303.00396 (2023)."},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1145\/1390156.1390306"},{"key":"e_1_3_2_1_35_1","volume-title":"Vlm: Task-agnostic video-language model pre-training for video understanding. arXiv preprint arXiv:2105.09996","author":"Xu Hu","year":"2021","unstructured":"Hu Xu, Gargi Ghosh, Po-Yao Huang, Prahal Arora, Masoumeh Aminzadeh, Christoph Feichtenhofer, Florian Metze, and Luke Zettlemoyer. 2021. Vlm: Task-agnostic video-language model pre-training for video understanding. arXiv preprint arXiv:2105.09996 (2021)."},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.512"}],"event":{"name":"SIGIR 2024: The 47th International ACM SIGIR Conference on Research and Development in Information Retrieval","sponsor":["SIGIR ACM Special Interest Group on Information Retrieval"],"location":"Washington DC USA","acronym":"SIGIR 2024"},"container-title":["Proceedings of the 47th International ACM SIGIR Conference on Research and Development in Information Retrieval"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3626772.3657795","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3626772.3657795","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T05:42:12Z","timestamp":1755841332000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3626772.3657795"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,7,10]]},"references-count":36,"alternative-id":["10.1145\/3626772.3657795","10.1145\/3626772"],"URL":"https:\/\/doi.org\/10.1145\/3626772.3657795","relation":{},"subject":[],"published":{"date-parts":[[2024,7,10]]},"assertion":[{"value":"2024-07-11","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}