{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,12]],"date-time":"2025-12-12T13:08:25Z","timestamp":1765544905627,"version":"3.41.0"},"publisher-location":"New York, NY, USA","reference-count":33,"publisher":"ACM","license":[{"start":{"date-parts":[[2023,7,18]],"date-time":"2023-07-18T00:00:00Z","timestamp":1689638400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"DOI":"10.13039\/100020409","name":"Analytical Center for the Government of the Russian Federation","doi-asserted-by":"publisher","award":["000000D730321P5Q0002"],"award-info":[{"award-number":["000000D730321P5Q0002"]}],"id":[{"id":"10.13039\/100020409","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2023,7,19]]},"DOI":"10.1145\/3539618.3592064","type":"proceedings-article","created":{"date-parts":[[2023,7,19]],"date-time":"2023-07-19T00:22:23Z","timestamp":1689726143000},"page":"2394-2398","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":4,"title":["Sinkhorn Transformations for Single-Query Postprocessing in Text-Video Retrieval"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0005-9397-6081","authenticated-orcid":false,"given":"Konstantin","family":"Yakovlev","sequence":"first","affiliation":[{"name":"Huawei Noah's Ark Lab, Moscow, Russian Fed."}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-7536-9670","authenticated-orcid":false,"given":"Gregory","family":"Polyakov","sequence":"additional","affiliation":[{"name":"Huawei Noah's Ark Lab, Moscow, Russian Fed."}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4528-6631","authenticated-orcid":false,"given":"Ilseyar","family":"Alimova","sequence":"additional","affiliation":[{"name":"Huawei Noah's Ark Lab, Moscow, Russian Fed."}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2892-7356","authenticated-orcid":false,"given":"Alexander","family":"Podolskiy","sequence":"additional","affiliation":[{"name":"Huawei Noah's Ark Lab, Moscow, Russian Fed."}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-8786-2269","authenticated-orcid":false,"given":"Andrey","family":"Bout","sequence":"additional","affiliation":[{"name":"Huawei Noah's Ark Lab, Moscow, Russian Fed."}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7787-2251","authenticated-orcid":false,"given":"Sergey","family":"Nikolenko","sequence":"additional","affiliation":[{"name":"Ivannikov Institute for System Programming of the RAS &amp; St. Petersburg Department of the Steklov Institute of Mathematics, Moscow, Russian Fed."}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-0299-5849","authenticated-orcid":false,"given":"Irina","family":"Piontkovskaya","sequence":"additional","affiliation":[{"name":"Huawei Noah's Ark Lab, Moscow, Russian Fed."}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2023,7,18]]},"reference":[{"key":"e_1_3_2_2_1_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.618"},{"key":"e_1_3_2_2_2_1","volume-title":"Is Space-Time Attention All You Need for Video Understanding? CoRR","author":"Bertasius Gedas","year":"2021","unstructured":"Gedas Bertasius, Heng Wang, and Lorenzo Torresani. 2021. Is Space-Time Attention All You Need for Video Understanding? CoRR, Vol. abs\/2102.05095 (2021). [arXiv]2102.05095 https:\/\/arxiv.org\/abs\/2102.05095"},{"key":"e_1_3_2_2_3_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00513"},{"key":"e_1_3_2_2_4_1","doi-asserted-by":"publisher","DOI":"10.5555\/2002472.2002497"},{"key":"e_1_3_2_2_5_1","doi-asserted-by":"publisher","DOI":"10.48550\/arXiv.2210.11929"},{"key":"e_1_3_2_2_6_1","volume-title":"Improving Video-Text Retrieval by Multi-Stream Corpus Alignment and Dual Softmax Loss. CoRR","author":"Cheng Xing","year":"2021","unstructured":"Xing Cheng, Hezheng Lin, Xiangyu Wu, Fan Yang, and Dong Shen. 2021. Improving Video-Text Retrieval by Multi-Stream Corpus Alignment and Dual Softmax Loss. CoRR, Vol. abs\/2109.04290 (2021). [arXiv]2109.04290 https:\/\/arxiv.org\/abs\/2109.04290"},{"key":"e_1_3_2_2_7_1","volume-title":"Weinberger (Eds.)","volume":"26","author":"Cuturi Marco","year":"2013","unstructured":"Marco Cuturi. 2013. Sinkhorn Distances: Lightspeed Computation of Optimal Transport. In Advances in Neural Information Processing Systems, C.J. Burges, L. Bottou, M. Welling, Z. Ghahramani, and K.Q. Weinberger (Eds.), Vol. 26. Curran Associates, Inc. https:\/\/proceedings.neurips.cc\/paper\/2013\/file\/af21d0c97db2e27e13572cbf59eb343d-Paper.pdf"},{"key":"e_1_3_2_2_8_1","volume-title":"An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale. CoRR","author":"Dosovitskiy Alexey","year":"1929","unstructured":"Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, and Neil Houlsby. 2020. An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale. CoRR, Vol. abs\/2010.11929 (2020). [arXiv]2010.11929 https:\/\/arxiv.org\/abs\/2010.11929"},{"key":"e_1_3_2_2_9_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58548-8_13"},{"key":"e_1_3_2_2_10_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00495"},{"key":"e_1_3_2_2_11_1","doi-asserted-by":"publisher","DOI":"10.1109\/TSMCC.2011.2109710"},{"key":"e_1_3_2_2_12_1","volume-title":"5th International Conference on Learning Representations, ICLR","author":"Jang Eric","year":"2017","unstructured":"Eric Jang, Shixiang Gu, and Ben Poole. 2017. Categorical Reparameterization with Gumbel-Softmax. In 5th International Conference on Learning Representations, ICLR 2017, Toulon, France, April 24-26, 2017, Conference Track Proceedings. OpenReview.net. https:\/\/openreview.net\/forum?id=rkE3y85ee"},{"key":"e_1_3_2_2_13_1","doi-asserted-by":"publisher","unstructured":"Jie Jiang Shaobo Min Weijie Kong Dihong Gong Hongfa Wang Zhifeng Li and Wei Liu. 2022. Tencent Text-Video Retrieval: Hierarchical Cross-Modal Interactions with Multi-Level Representations. https:\/\/doi.org\/10.48550\/ARXIV.2204.03382","DOI":"10.48550\/ARXIV.2204.03382"},{"key":"e_1_3_2_2_14_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.83"},{"key":"e_1_3_2_2_15_1","volume-title":"Less is More: ClipBERT for Video-and-Language Learning via Sparse Sampling. CoRR","author":"Lei Jie","year":"2021","unstructured":"Jie Lei, Linjie Li, Luowei Zhou, Zhe Gan, Tamara L. Berg, Mohit Bansal, and Jingjing Liu. 2021. Less is More: ClipBERT for Video-and-Language Learning via Sparse Sampling. CoRR, Vol. abs\/2102.06183 (2021). [arXiv]2102.06183 https:\/\/arxiv.org\/abs\/2102.06183"},{"key":"e_1_3_2_2_16_1","volume-title":"International Conference on Machine Learning. PMLR, 12888--12900","author":"Li Junnan","year":"2022","unstructured":"Junnan Li, Dongxu Li, Caiming Xiong, and Steven Hoi. 2022. Blip: Bootstrapping language-image pre-training for unified vision-language understanding and generation. In International Conference on Machine Learning. PMLR, 12888--12900."},{"key":"e_1_3_2_2_17_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.neucom.2022.07.028"},{"key":"e_1_3_2_2_18_1","doi-asserted-by":"publisher","unstructured":"Yiwei Ma Guohai Xu Xiaoshuai Sun Ming Yan Ji Zhang and Rongrong Ji. 2022. X-CLIP: End-to-End Multi-grained Contrastive Learning for Video-Text Retrieval. https:\/\/doi.org\/10.48550\/ARXIV.2207.07285","DOI":"10.48550\/ARXIV.2207.07285"},{"key":"e_1_3_2_2_19_1","volume-title":"EdiT5: Semi-Autoregressive Text-Editing with T5 Warm-Start. ArXiv","author":"Mallinson Jonathan","year":"2022","unstructured":"Jonathan Mallinson, Jakub Adamek, Eric Malmi, and Aliaksei Severyn. 2022. EdiT5: Semi-Autoregressive Text-Editing with T5 Warm-Start. ArXiv, Vol. abs\/2205.12209 (2022)."},{"key":"e_1_3_2_2_20_1","volume-title":"Learning Latent Permutations with Gumbel-Sinkhorn Networks. ArXiv","author":"Mena Gonzalo E.","year":"2018","unstructured":"Gonzalo E. Mena, David Belanger, Scott W. Linderman, and Jasper Snoek. 2018. Learning Latent Permutations with Gumbel-Sinkhorn Networks. ArXiv, Vol. abs\/1802.08665 (2018)."},{"key":"e_1_3_2_2_21_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00272"},{"key":"e_1_3_2_2_22_1","volume-title":"International conference on machine learning. PMLR, 8748--8763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al. 2021a. Learning transferable visual models from natural language supervision. In International conference on machine learning. PMLR, 8748--8763."},{"key":"e_1_3_2_2_23_1","volume-title":"Proceedings of the 38th International Conference on Machine Learning (Proceedings of Machine Learning Research","volume":"8763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, and Ilya Sutskever. 2021b. Learning Transferable Visual Models From Natural Language Supervision. In Proceedings of the 38th International Conference on Machine Learning (Proceedings of Machine Learning Research, Vol. 139), Marina Meila and Tong Zhang (Eds.). PMLR, 8748--8763. https:\/\/proceedings.mlr.press\/v139\/radford21a.html"},{"key":"e_1_3_2_2_24_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298940"},{"key":"e_1_3_2_2_25_1","volume-title":"Advances in Neural Information Processing Systems","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, \u0141ukasz Kaiser, and Illia Polosukhin. 2017. Attention is All you Need. In Advances in Neural Information Processing Systems, I. Guyon, U. Von Luxburg, S. Bengio, H. Wallach, R. Fergus, S. Vishwanathan, and R. Garnett (Eds.), Vol. 30. Curran Associates, Inc. https:\/\/proceedings.neurips.cc\/paper\/2017\/file\/3f5ee243547dee91fbd053c1c4a845aa-Paper.pdf"},{"key":"e_1_3_2_2_26_1","doi-asserted-by":"publisher","unstructured":"Qiang Wang Yanhao Zhang Yun Zheng Pan Pan and Xian-Sheng Hua. 2022b. Disentangled Representation Learning for Text-Video Retrieval. https:\/\/doi.org\/10.48550\/ARXIV.2203.07111","DOI":"10.48550\/ARXIV.2203.07111"},{"key":"e_1_3_2_2_27_1","volume-title":"Disentangled Representation Learning for Text-Video Retrieval. arXiv:2203.07111","author":"Wang Qiang","year":"2022","unstructured":"Qiang Wang, Yanhao Zhang, Yun Zheng, Pan Pan, and Xian-Sheng Hua. 2022c. Disentangled Representation Learning for Text-Video Retrieval. arXiv:2203.07111 (2022)."},{"key":"e_1_3_2_2_28_1","volume-title":"VATEX: A Large-Scale, High-Quality Multilingual Dataset for Video-and-Language Research. CoRR","author":"Wang Xin","year":"2019","unstructured":"Xin Wang, Jiawei Wu, Junkun Chen, Lei Li, Yuan-Fang Wang, and William Yang Wang. 2019. VATEX: A Large-Scale, High-Quality Multilingual Dataset for Video-and-Language Research. CoRR, Vol. abs\/1904.03493 (2019). [arXiv]1904.03493 http:\/\/arxiv.org\/abs\/1904.03493"},{"key":"e_1_3_2_2_29_1","doi-asserted-by":"publisher","unstructured":"Yi Wang Kunchang Li Yizhuo Li Yinan He Bingkun Huang Zhiyu Zhao Hongjie Zhang Jilan Xu Yi Liu Zun Wang Sen Xing Guo Chen Junting Pan Jiashuo Yu Yali Wang Limin Wang and Yu Qiao. 2022a. InternVideo: General Video Foundation Models via Generative and Discriminative Learning. https:\/\/doi.org\/10.48550\/ARXIV.2212.03191","DOI":"10.48550\/ARXIV.2212.03191"},{"key":"e_1_3_2_2_30_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.emnlp-main.544"},{"key":"e_1_3_2_2_31_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.571"},{"key":"e_1_3_2_2_32_1","doi-asserted-by":"publisher","unstructured":"Hongwei Xue Yuchong Sun Bei Liu Jianlong Fu Ruihua Song Houqiang Li and Jiebo Luo. 2022. CLIP-ViP: Adapting Pre-trained Image-Text Model to Video-Language Representation Alignment. https:\/\/doi.org\/10.48550\/ARXIV.2209.06430","DOI":"10.48550\/ARXIV.2209.06430"},{"key":"e_1_3_2_2_33_1","volume-title":"Corso","author":"Zhou Luowei","year":"2017","unstructured":"Luowei Zhou, Chenliang Xu, and Jason J. Corso. 2017. ProcNets: Learning to Segment Procedures in Untrimmed and Unconstrained Videos. CoRR, Vol. abs\/1703.09788 (2017). [arXiv]1703.09788 http:\/\/arxiv.org\/abs\/1703.09788"}],"event":{"name":"SIGIR '23: The 46th International ACM SIGIR Conference on Research and Development in Information Retrieval","sponsor":["SIGIR ACM Special Interest Group on Information Retrieval"],"location":"Taipei Taiwan","acronym":"SIGIR '23"},"container-title":["Proceedings of the 46th International ACM SIGIR Conference on Research and Development in Information Retrieval"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3539618.3592064","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3539618.3592064","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,17]],"date-time":"2025-06-17T16:38:03Z","timestamp":1750178283000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3539618.3592064"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,7,18]]},"references-count":33,"alternative-id":["10.1145\/3539618.3592064","10.1145\/3539618"],"URL":"https:\/\/doi.org\/10.1145\/3539618.3592064","relation":{},"subject":[],"published":{"date-parts":[[2023,7,18]]},"assertion":[{"value":"2023-07-18","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}