{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,10,15]],"date-time":"2025-10-15T10:37:52Z","timestamp":1760524672675,"version":"3.41.0"},"publisher-location":"New York, NY, USA","reference-count":60,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,10,28]],"date-time":"2024-10-28T00:00:00Z","timestamp":1730073600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"DOI":"10.13039\/https:\/\/doi.org\/10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62102031"],"award-info":[{"award-number":["62102031"]}],"id":[{"id":"10.13039\/https:\/\/doi.org\/10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"name":"the foundation of Key Laboratory of Artificial Intelligence, Ministry of Education, Shanghai, P.R. China","award":["AI202409"],"award-info":[{"award-number":["AI202409"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,10,28]]},"DOI":"10.1145\/3664647.3680751","type":"proceedings-article","created":{"date-parts":[[2024,10,26]],"date-time":"2024-10-26T06:59:33Z","timestamp":1729925973000},"page":"9739-9748","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":1,"title":["An Inverse Partial Optimal Transport Framework for Music-guided Trailer Generation"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0000-2225-5037","authenticated-orcid":false,"given":"Yutong","family":"Wang","sequence":"first","affiliation":[{"name":"Beijing Institute of Technology, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-1564-5126","authenticated-orcid":false,"given":"Sidan","family":"Zhu","sequence":"additional","affiliation":[{"name":"Beijing Institute of Technology, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4192-5360","authenticated-orcid":false,"given":"Hongteng","family":"Xu","sequence":"additional","affiliation":[{"name":"Renmin University of China, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1136-8903","authenticated-orcid":false,"given":"Dixin","family":"Luo","sequence":"additional","affiliation":[{"name":"Beijing Institute of Technology &amp; Key Laboratory of Artificial Intelligence, Ministry of Education, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2024,10,28]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Find the Cliffhanger: Multi-modal Trailerness in Soap Operas. In International Conference on Multimedia Modeling. Springer, 199--212","author":"Bretti Carlo","year":"2024","unstructured":"Carlo Bretti, Pascal Mettes, Hendrik Vincent Koops, Daan Odijk, and Nanne van Noord. 2024. Find the Cliffhanger: Multi-modal Trailerness in Soap Operas. In International Conference on Multimedia Modeling. Springer, 199--212."},{"key":"e_1_3_2_1_2_1","volume-title":"International Conference on Machine Learning. PMLR, 1542--1553","author":"Chen Liqun","year":"2020","unstructured":"Liqun Chen, Zhe Gan, Yu Cheng, Linjie Li, Lawrence Carin, and Jingjing Liu. 2020. Graph optimal transport for cross-domain alignment. In International Conference on Machine Learning. PMLR, 1542--1553."},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58577-8_7"},{"key":"e_1_3_2_1_4_1","volume-title":"DHOT-GM: Robust Graph Matching Using A Differentiable Hierarchical Optimal Transport Framework. arXiv preprint arXiv:2310.12081","author":"Cheng Haoran","year":"2023","unstructured":"Haoran Cheng, Dixin Luo, and Hongteng Xu. 2023. DHOT-GM: Robust Graph Matching Using A Differentiable Hierarchical Optimal Transport Framework. arXiv preprint arXiv:2310.12081 (2023)."},{"key":"e_1_3_2_1_5_1","volume-title":"International Conference on Machine Learning. PMLR, 3925--3946","author":"Chiu Wei-Ting","year":"2022","unstructured":"Wei-Ting Chiu, Pei Wang, and Patrick Shafto. 2022. Discrete probabilistic inverse optimal transport. In International Conference on Machine Learning. PMLR, 3925--3946."},{"key":"e_1_3_2_1_6_1","volume-title":"Sinkhorn distances: Lightspeed computation of optimal transport. Advances in neural information processing systems","author":"Cuturi Marco","year":"2013","unstructured":"Marco Cuturi. 2013. Sinkhorn distances: Lightspeed computation of optimal transport. Advances in neural information processing systems, Vol. 26 (2013)."},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1145\/957013.957124"},{"key":"e_1_3_2_1_8_1","unstructured":"Matthijs Douze Alexandr Guzhva Chengqi Deng Jeff Johnson Gergely Szilvasy Pierre-Emmanuel Mazar\u00e9 Maria Lomeli Lucas Hosseini and Herv\u00e9 J\u00e9gou. 2024. The Faiss library. (2024). arxiv: 2401.08281 [cs.LG]"},{"key":"e_1_3_2_1_9_1","volume-title":"Asian Conference on Computer Vision. Springer, 39--54","author":"Fajtl Jiri","year":"2018","unstructured":"Jiri Fajtl, Hajar Sadeghi Sokeh, Vasileios Argyriou, Dorothy Monekosso, and Paolo Remagnino. 2018. Summarizing videos with attention. In Asian Conference on Computer Vision. Springer, 39--54."},{"key":"e_1_3_2_1_10_1","volume-title":"Proceedings, Part IV 16","author":"Gabeur Valentin","year":"2020","unstructured":"Valentin Gabeur, Chen Sun, Karteek Alahari, and Cordelia Schmid. 2020. Multi-modal transformer for video retrieval. In Computer Vision--ECCV 2020: 16th European Conference, Glasgow, UK, August 23--28, 2020, Proceedings, Part IV 16. Springer, 214--229."},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01812"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/ISM.2014.49"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2017.2729019"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01457"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1145\/3511808.3557339"},{"key":"e_1_3_2_1_16_1","first-page":"3988","article-title":"Deep declarative networks","volume":"44","author":"Gould Stephen","year":"2021","unstructured":"Stephen Gould, Richard Hartley, and Dylan Campbell. 2021. Deep declarative networks. IEEE Transactions on Pattern Analysis and Machine Intelligence, Vol. 44, 8 (2021), 3988--4004.","journal-title":"IEEE Transactions on Pattern Analysis and Machine Intelligence"},{"key":"e_1_3_2_1_17_1","unstructured":"Jonathan Ho William Chan Chitwan Saharia Jay Whang Ruiqi Gao Alexey Gritsenko Diederik P Kingma Ben Poole Mohammad Norouzi David J Fleet et al. 2022. Imagen video: High definition video generation with diffusion models. arXiv preprint arXiv:2210.02303 (2022)."},{"key":"e_1_3_2_1_18_1","volume-title":"From trailers to storylines: An efficient way to learn from movies. arXiv preprint arXiv:1806.05341","author":"Huang Qingqiu","year":"2018","unstructured":"Qingqiu Huang, Yuanjun Xiong, Yu Xiong, Yuqi Zhang, and Dahua Lin. 2018. From trailers to storylines: An efficient way to learn from movies. arXiv preprint arXiv:1806.05341 (2018)."},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1145\/1873951.1874092"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/TBDATA.2019.2921572"},{"key":"e_1_3_2_1_21_1","volume-title":"Adam: A method for stochastic optimization. arXiv preprint arXiv:1412.6980","author":"Kingma Diederik P","year":"2014","unstructured":"Diederik P Kingma and Jimmy Ba. 2014. Adam: A method for stochastic optimization. arXiv preprint arXiv:1412.6980 (2014)."},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00999"},{"key":"e_1_3_2_1_23_1","volume-title":"Hierarchical optimal transport for multimodal distribution alignment. Advances in neural information processing systems","author":"Lee John","year":"2019","unstructured":"John Lee, Max Dabagia, Eva Dyer, and Christopher Rozell. 2019. Hierarchical optimal transport for multimodal distribution alignment. Advances in neural information processing systems, Vol. 32 (2019)."},{"key":"e_1_3_2_1_24_1","first-page":"1","article-title":"Learning to match via inverse optimal transport","volume":"20","author":"Li Ruilin","year":"2019","unstructured":"Ruilin Li, Xiaojing Ye, Haomin Zhou, and Hongyuan Zha. 2019. Learning to match via inverse optimal transport. Journal of machine learning research, Vol. 20, 80 (2019), 1--37.","journal-title":"Journal of machine learning research"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11390-023-3064-6"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3548067"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2022.3222569"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1145\/641007.641116"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01437"},{"key":"e_1_3_2_1_30_1","volume-title":"Boundary-aware self-supervised learning for video scene segmentation. arXiv preprint arXiv:2201.05277","author":"Mun Jonghwan","year":"2022","unstructured":"Jonghwan Mun, Minchul Shin, Gunsoo Han, Sangho Lee, Seongsu Ha, Joonseok Lee, and Eun-Sol Kim. 2022. Boundary-aware self-supervised learning for video scene segmentation. arXiv preprint arXiv:2201.05277 (2022)."},{"key":"e_1_3_2_1_31_1","volume-title":"arXiv preprint arXiv:2208.06773","author":"Narasimhan Medhini","year":"2022","unstructured":"Medhini Narasimhan, Arsha Nagrani, Chen Sun, Michael Rubinstein, Trevor Darrell, Anna Rohrbach, and Cordelia Schmid. 2022. TL; DW? Summarizing Instructional Videos with Task Relevance & Cross-Modal Saliency. arXiv preprint arXiv:2208.06773 (2022)."},{"key":"e_1_3_2_1_32_1","volume-title":"Advances in neural information processing systems","author":"Narasimhan Medhini","year":"2021","unstructured":"Medhini Narasimhan, Anna Rohrbach, and Trevor Darrell. 2021. Clip-it! language-guided video summarization. Advances in neural information processing systems, Vol. 34 (2021), 13988--14000."},{"key":"e_1_3_2_1_33_1","volume-title":"Semantic video trailers. arXiv preprint arXiv:1609.01819","author":"Oosterhuis Harrie","year":"2016","unstructured":"Harrie Oosterhuis, Sujith Ravi, and Michael Bendersky. 2016. Semantic video trailers. arXiv preprint arXiv:1609.01819 (2016)."},{"key":"e_1_3_2_1_34_1","volume-title":"Movie plot analysis via turning point identification. arXiv preprint arXiv:1908.10328","author":"Papalampidi Pinelopi","year":"2019","unstructured":"Pinelopi Papalampidi, Frank Keller, and Mirella Lapata. 2019. Movie plot analysis via turning point identification. arXiv preprint arXiv:1908.10328 (2019)."},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v35i15.17607"},{"key":"e_1_3_2_1_36_1","volume-title":"Giovanni Chierchia, and Pascal Frossard.","author":"Maretic Hermina Petric","year":"2019","unstructured":"Hermina Petric Maretic, Mireille El Gheche, Giovanni Chierchia, and Pascal Frossard. 2019. GOT: an optimal transport framework for graph comparison. Advances in Neural Information Processing Systems, Vol. 32 (2019)."},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58604-1_32"},{"key":"e_1_3_2_1_38_1","first-page":"5373","article-title":"Accurate point cloud registration with robust optimal transport","volume":"34","author":"Shen Zhengyang","year":"2021","unstructured":"Zhengyang Shen, Jean Feydy, Peirong Liu, Ariel H Curiale, Ruben San Jose Estepar, Raul San Jose Estepar, and Marc Niethammer. 2021. Accurate point cloud registration with robust optimal transport. Advances in Neural Information Processing Systems, Vol. 34 (2021), 5373--5389.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_39_1","volume-title":"Make-a-video: Text-to-video generation without text-video data. arXiv preprint arXiv:2209.14792","author":"Singer Uriel","year":"2022","unstructured":"Uriel Singer, Adam Polyak, Thomas Hayes, Xi Yin, Jie An, Songyang Zhang, Qiyuan Hu, Harry Yang, Oron Ashual, Oran Gafni, et al. 2022. Make-a-video: Text-to-video generation without text-video data. arXiv preprint arXiv:2209.14792 (2022)."},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1145\/3123266.3127906"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1137\/19M1261122"},{"key":"e_1_3_2_1_42_1","unstructured":"Domen Tabernik Alan Lukezic and Klemen Grm. [n. d.]. movie2trailer: Unsupervised trailer generation using Anomaly detection. ( [n. d.])."},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.sigpro.2019.107299"},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"crossref","unstructured":"C\u00e9dric Villani et al. 2009. Optimal transport: old and new. Vol. 338. Springer.","DOI":"10.1007\/978-3-540-71050-9"},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00795"},{"key":"e_1_3_2_1_46_1","volume-title":"Advances in Neural Information Processing Systems","volume":"27","author":"Wang Huahua","year":"2014","unstructured":"Huahua Wang and Arindam Banerjee. 2014. Bregman alternating direction method of multipliers. Advances in Neural Information Processing Systems, Vol. 27 (2014)."},{"key":"e_1_3_2_1_47_1","volume-title":"Proceedings, Part XVIII 16","author":"Wang Lezi","year":"2020","unstructured":"Lezi Wang, Dong Liu, Rohit Puri, and Dimitris N Metaxas. 2020. Learning trailer moments in full-length movies with co-contrastive attention. In Computer Vision--ECCV 2020: 16th European Conference, Glasgow, UK, August 23--28, 2020, Proceedings, Part XVIII 16. Springer, 300--316."},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3548098"},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3612087"},{"key":"e_1_3_2_1_50_1","volume-title":"International Conference on Learning Representations.","author":"Xie Yujia","year":"2020","unstructured":"Yujia Xie, Yixiu Mao, Simiao Zuo, Hongteng Xu, Xiaojing Ye, Tuo Zhao, and Hongyuan Zha. 2020. A hypergradient approach to robust regression without correspondence. In International Conference on Learning Representations."},{"key":"e_1_3_2_1_51_1","unstructured":"Yujia Xie Xiangfeng Wang Ruijia Wang and Hongyuan Zha. 2020. A fast proximal point method for computing exact wasserstein distance. In Uncertainty in artificial intelligence. PMLR 433--453."},{"key":"e_1_3_2_1_52_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i04.6120"},{"key":"e_1_3_2_1_53_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2023.3314661"},{"key":"e_1_3_2_1_54_1","volume-title":"International conference on machine learning. PMLR, 6932--6941","author":"Xu Hongteng","year":"2019","unstructured":"Hongteng Xu, Dixin Luo, Hongyuan Zha, and Lawrence Carin Duke. 2019. Gromov-wasserstein learning for graph matching and node embedding. In International conference on machine learning. PMLR, 6932--6941."},{"key":"e_1_3_2_1_55_1","volume-title":"Twenty-Fourth International Joint Conference on Artificial Intelligence.","author":"Xu Hongteng","year":"2015","unstructured":"Hongteng Xu, Yi Zhen, and Hongyuan Zha. 2015. Trailer generation via a point process-based visual attractiveness model. In Twenty-Fourth International Joint Conference on Artificial Intelligence."},{"key":"e_1_3_2_1_56_1","doi-asserted-by":"crossref","unstructured":"Renjun Xu Pelen Liu Yin Zhang Fang Cai Jindong Wang Shuoying Liang Heting Ying and Jianwei Yin. 2020. Joint Partial Optimal Transport for Open Set Domain Adaptation.. In IJCAI. 2540--2546.","DOI":"10.24963\/ijcai.2020\/352"},{"key":"e_1_3_2_1_57_1","doi-asserted-by":"publisher","DOI":"10.1145\/3477495.3531974"},{"key":"e_1_3_2_1_58_1","volume-title":"Proceedings, Part VIII 16","author":"Zhao Peisen","year":"2020","unstructured":"Peisen Zhao, Lingxi Xie, Chen Ju, Ya Zhang, Yanfeng Wang, and Qi Tian. 2020. Bottom-up temporal action localization with mutual regularization. In Computer Vision--ECCV 2020: 16th European Conference, Glasgow, UK, August 23--28, 2020, Proceedings, Part VIII 16. Springer, 539--555."},{"key":"e_1_3_2_1_59_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-017-1033-7"},{"key":"e_1_3_2_1_60_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01326"}],"event":{"name":"MM '24: The 32nd ACM International Conference on Multimedia","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Melbourne VIC Australia","acronym":"MM '24"},"container-title":["Proceedings of the 32nd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3680751","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3664647.3680751","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T00:57:42Z","timestamp":1750294662000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3680751"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,28]]},"references-count":60,"alternative-id":["10.1145\/3664647.3680751","10.1145\/3664647"],"URL":"https:\/\/doi.org\/10.1145\/3664647.3680751","relation":{},"subject":[],"published":{"date-parts":[[2024,10,28]]},"assertion":[{"value":"2024-10-28","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}