{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,17]],"date-time":"2026-04-17T16:20:09Z","timestamp":1776442809979,"version":"3.51.2"},"publisher-location":"New York, NY, USA","reference-count":64,"publisher":"ACM","license":[{"start":{"date-parts":[[2025,3,24]],"date-time":"2025-03-24T00:00:00Z","timestamp":1742774400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,3,24]]},"DOI":"10.1145\/3708359.3712113","type":"proceedings-article","created":{"date-parts":[[2025,3,19]],"date-time":"2025-03-19T12:50:34Z","timestamp":1742388634000},"page":"609-623","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":2,"title":["EditIQ: Automated Cinematic Editing of Static Wide-Angle Videos via Dialogue Interpretation and Saliency Cues"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0000-3361-5894","authenticated-orcid":false,"given":"Rohit","family":"Girmaji","sequence":"first","affiliation":[{"name":"CVIT, International Institute of Information Technology, Hyderabad, Hyderabad, Telangana, India,"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-5064-8725","authenticated-orcid":false,"given":"Bhav","family":"Beri","sequence":"additional","affiliation":[{"name":"CVIT, International Institute of Information Technology, Hyderabad, Hyderabad, Telangana, India,"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9441-7074","authenticated-orcid":false,"given":"Ramanathan","family":"Subramanian","sequence":"additional","affiliation":[{"name":"University of Canberra, Canberra, ACT, Australia,"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8861-7731","authenticated-orcid":false,"given":"Vineet","family":"Gandhi","sequence":"additional","affiliation":[{"name":"CVIT, International Institute of Information Technology, Hyderabad, Hyderabad, Telangana, India,"}]}],"member":"320","published-online":{"date-parts":[[2025,3,24]]},"reference":[{"key":"e_1_3_3_3_2_2","doi-asserted-by":"publisher","DOI":"10.1109\/WACV57701.2024.00406"},{"key":"e_1_3_3_3_3_2","unstructured":"Nir Aharon Roy Orfaig and Ben-Zion Bobrovsky. 2022. BoT-SORT: Robust Associations Multi-Pedestrian Tracking. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2206.14651 (2022)."},{"key":"e_1_3_3_3_4_2","unstructured":"Anthropic. 2024. The Claude 3 Model Family: Opus Sonnet Haiku. https:\/\/assets.anthropic.com\/m\/61e7d27f8c8f5919\/original\/Claude-3-Model-Card.pdf"},{"key":"e_1_3_3_3_5_2","doi-asserted-by":"crossref","unstructured":"Ido Arev Hyun\u00a0Soo Park Yaser Sheikh Jessica Hodgins and Ariel Shamir. 2014. Automatic editing of footage from multiple social cameras. ACM Transactions on Graphics (TOG) 33 4 (2014) 81.","DOI":"10.1145\/2601097.2601198"},{"key":"e_1_3_3_3_6_2","unstructured":"Daniel Arijon. 1976. Grammar of the film language. (1976)."},{"key":"e_1_3_3_3_7_2","doi-asserted-by":"crossref","unstructured":"Max Bain Jaesung Huh Tengda Han and Andrew Zisserman. 2023. WhisperX: Time-Accurate Speech Transcription of Long-Form Audio. INTERSPEECH 2023 (2023).","DOI":"10.21437\/Interspeech.2023-78"},{"key":"e_1_3_3_3_8_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00934"},{"key":"e_1_3_3_3_9_2","doi-asserted-by":"publisher","DOI":"10.1145\/2502081.2502086"},{"key":"e_1_3_3_3_10_2","unstructured":"Qinyao Chang and Shiping Zhu. 2021. Temporal-spatial feature pyramid for video saliency detection. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2105.04213 (2021)."},{"key":"e_1_3_3_3_11_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICME.2013.6607445"},{"key":"e_1_3_3_3_12_2","doi-asserted-by":"crossref","unstructured":"Fan Chen and Christophe De\u00a0Vleeschouwer. 2010. Personalized production of basketball videos from multi-sensored data under limited display resolution. Computer Vision and Image Understanding 114 6 (2010) 667\u2013680.","DOI":"10.1016\/j.cviu.2010.01.005"},{"key":"e_1_3_3_3_13_2","doi-asserted-by":"publisher","DOI":"10.1109\/WACV.2018.00053"},{"key":"e_1_3_3_3_14_2","first-page":"148","volume-title":"AAAI\/IAAI, Vol. 1","author":"Christianson David\u00a0B","year":"1996","unstructured":"David\u00a0B Christianson, Sean\u00a0E Anderson, Li-wei He, David\u00a0H Salesin, Daniel\u00a0S Weld, and Michael\u00a0F Cohen. 1996. Declarative camera control for automatic cinematography. In AAAI\/IAAI, Vol. 1. 148\u2013155."},{"key":"e_1_3_3_3_15_2","doi-asserted-by":"publisher","DOI":"10.1109\/EUSIPCO.2015.7362640"},{"key":"e_1_3_3_3_16_2","doi-asserted-by":"crossref","unstructured":"James Cutting and Ayse Candan\u00a0Simsek. 2015. Shot Durations Shot Classes and the Increased Pace of Popular Movies. Projections 9 (12 2015) 40\u201352. https:\/\/doi.org\/10.3167\/proj.2015.090204","DOI":"10.3167\/proj.2015.090204"},{"key":"e_1_3_3_3_17_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00630"},{"key":"e_1_3_3_3_18_2","doi-asserted-by":"crossref","unstructured":"Vamsidhar\u00a0Reddy Gaddam Ragnhild Eg Ragnar Langseth Carsten Griwodz and P\u00e5l Halvorsen. 2015. The Cameraman Operating My Virtual Camera is Artificial: Can the Machine Be as Good as a Human? ACM Trans. Multimedia Comput. Commun. Appl. 11 4 Article 56 (June 2015) 20\u00a0pages.","DOI":"10.1145\/2744411"},{"key":"e_1_3_3_3_19_2","doi-asserted-by":"publisher","DOI":"10.5555\/2887007.2887112"},{"key":"e_1_3_3_3_20_2","first-page":"31","volume-title":"4th Workshop on Intelligent Camera Control, Cinematography and Editing","author":"Gandhi Vineet","year":"2015","unstructured":"Vineet Gandhi and R\u00e9mi Ronfard. 2015. A computational framework for vertical video editing. In 4th Workshop on Intelligent Camera Control, Cinematography and Editing. Eurographics Association, 31\u201337."},{"key":"e_1_3_3_3_21_2","doi-asserted-by":"publisher","DOI":"10.1145\/2668904.2668936"},{"key":"e_1_3_3_3_22_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2011.5995525"},{"key":"e_1_3_3_3_23_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00633"},{"key":"e_1_3_3_3_24_2","doi-asserted-by":"publisher","DOI":"10.1145\/237170.237259"},{"key":"e_1_3_3_3_25_2","doi-asserted-by":"crossref","unstructured":"Rachel Heck Michael Wallick and Michael Gleicher. 2007. Virtual videography. ACM Transactions on Multimedia Computing Communications and Applications (TOMM) 3 1 (2007) 4\u2013es.","DOI":"10.1145\/1198302.1198306"},{"key":"e_1_3_3_3_26_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.633"},{"key":"e_1_3_3_3_27_2","doi-asserted-by":"crossref","unstructured":"Eakta Jain Yaser Sheikh Ariel Shamir and Jessica Hodgins. 2015. Gaze-Driven Video Re-Editing. ACM Trans. Graph. 34 2 Article 21 (March 2015) 12\u00a0pages. https:\/\/doi.org\/10.1145\/2699644","DOI":"10.1145\/2699644"},{"key":"e_1_3_3_3_28_2","doi-asserted-by":"publisher","DOI":"10.1109\/IROS51168.2021.9635989"},{"key":"e_1_3_3_3_29_2","volume-title":"Ultralytics YOLO","author":"Jocher Glenn","year":"2023","unstructured":"Glenn Jocher, Ayush Chaurasia, and Jing Qiu. 2023. Ultralytics YOLO. https:\/\/github.com\/ultralytics\/ultralytics"},{"key":"e_1_3_3_3_30_2","doi-asserted-by":"publisher","DOI":"10.1145\/3604321.3604374"},{"key":"e_1_3_3_3_31_2","doi-asserted-by":"crossref","unstructured":"Yueying Kao Bowen Pan Miao Xu Jiangjing Lyu Xiangyu Zhu Yuanzhang Chang Xiaobo Li and Zhen Lei. 2023. Toward 3d face reconstruction in perspective projection: Estimating 6dof face pose from monocular image. IEEE Transactions on Image Processing 32 (2023) 3080\u20133091.","DOI":"10.1109\/TIP.2023.3275535"},{"key":"e_1_3_3_3_32_2","doi-asserted-by":"crossref","unstructured":"Bruno Korbar Jaesung Huh and Andrew Zisserman. 2024. Look Listen and Recognise: character-aware audio-visual subtitling. (2024).","DOI":"10.1109\/ICASSP48485.2024.10446480"},{"key":"e_1_3_3_3_33_2","volume-title":"British Machine Vision Conference","author":"Korbar Bruno","year":"2022","unstructured":"Bruno Korbar and Andrew Zisserman. 2022. Personalised CLIP or: how to find your vacation videos. In British Machine Vision Conference."},{"key":"e_1_3_3_3_34_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-07515-0_19"},{"key":"e_1_3_3_3_35_2","doi-asserted-by":"publisher","DOI":"10.1145\/1661412.1618472"},{"key":"e_1_3_3_3_36_2","doi-asserted-by":"crossref","unstructured":"Mackenzie Leake Abe Davis Anh Truong and Maneesh Agrawala. 2017. Computational video editing for dialogue-driven scenes. ACM Trans. Graph. 36 4 Article 130 (July 2017) 14\u00a0pages. https:\/\/doi.org\/10.1145\/3072959.3073653","DOI":"10.1145\/3072959.3073653"},{"key":"e_1_3_3_3_37_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-642-25289-1_35"},{"key":"e_1_3_3_3_38_2","doi-asserted-by":"publisher","DOI":"10.1145\/1180639.1180702"},{"key":"e_1_3_3_3_39_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58565-5_25"},{"key":"e_1_3_3_3_40_2","doi-asserted-by":"publisher","DOI":"10.1111\/cgf.12775"},{"key":"e_1_3_3_3_41_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00248"},{"key":"e_1_3_3_3_42_2","doi-asserted-by":"publisher","DOI":"10.1145\/3313831.3376544"},{"key":"e_1_3_3_3_43_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00053"},{"key":"e_1_3_3_3_44_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00053"},{"key":"e_1_3_3_3_45_2","doi-asserted-by":"crossref","unstructured":"Yingwei Pan Yue Chen Qian Bao Ning Zhang Ting Yao Jingen Liu and Tao Mei. 2021. Smart director: An event-driven directing system for live broadcasting. ACM Transactions on Multimedia Computing Communications and Applications (TOMM) 17 4 (2021) 1\u201318.","DOI":"10.1145\/3448981"},{"key":"e_1_3_3_3_46_2","volume-title":"Advances in Neural Information Processing Systems","author":"Park Hyun","year":"2012","unstructured":"Hyun Park, Eakta Jain, and Yaser Sheikh. 2012. 3D Social Saliency from Head-mounted Cameras. In Advances in Neural Information Processing Systems , F.\u00a0Pereira, C.J. Burges, L.\u00a0Bottou, and K.Q. Weinberger (Eds.), Vol.\u00a025. Curran Associates, Inc.https:\/\/proceedings.neurips.cc\/paper_files\/paper\/2012\/file\/1bf2efbbe0c49b9f567c2e40f645279a-Paper.pdf"},{"key":"e_1_3_3_3_47_2","doi-asserted-by":"crossref","unstructured":"Minglang Qiao Yufan Liu Mai Xu Xin Deng Bing Li Weiming Hu and Ali Borji. 2023. Joint Learning of Audio-Visual Saliency Prediction and Sound Source Localization on Multi-face Videos. 132 (2023) 2003\u20132025.","DOI":"10.1007\/s11263-023-01950-3"},{"key":"e_1_3_3_3_48_2","doi-asserted-by":"publisher","DOI":"10.1111\/cgf.13354"},{"key":"e_1_3_3_3_49_2","doi-asserted-by":"publisher","DOI":"10.1145\/1357054.1357095"},{"key":"e_1_3_3_3_50_2","volume-title":"Proceedings of International Broadcast Conference (IBC 2010)","volume":"1","author":"Sch\u00e4fer Ralf","year":"2010","unstructured":"Ralf Sch\u00e4fer, Peter Kauff, and Christian Weissig. 2010. Ultra high resolution video production and display as basis of a format agnostic production system. In Proceedings of International Broadcast Conference (IBC 2010) , Vol.\u00a01."},{"key":"e_1_3_3_3_51_2","first-page":"154","volume-title":"Asian Conference on Computer Vision","author":"Su Yu-Chuan","year":"2016","unstructured":"Yu-Chuan Su, Dinesh Jayaraman, and Kristen Grauman. 2016. Pano2vid: Automatic cinematography for watching 360 videos. In Asian Conference on Computer Vision. Springer, 154\u2013171."},{"key":"e_1_3_3_3_52_2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.emnlp-main.652"},{"key":"e_1_3_3_3_53_2","doi-asserted-by":"crossref","unstructured":"Chengzhou Tang Oliver Wang Feng Liu and Ping Tan. 2019. Joint stabilization and direction of 360 videos. ACM Transactions on Graphics (TOG) 38 2 (2019) 1\u201313.","DOI":"10.1145\/3211889"},{"key":"e_1_3_3_3_54_2","doi-asserted-by":"publisher","DOI":"10.1145\/3474085.3475587"},{"key":"e_1_3_3_3_55_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00482"},{"key":"e_1_3_3_3_56_2","doi-asserted-by":"publisher","DOI":"10.1109\/ADICS58448.2024.10533619"},{"key":"e_1_3_3_3_57_2","doi-asserted-by":"crossref","unstructured":"Krishnapriya Vishnubhotla Adam Hammond Graeme Hirst and Saif\u00a0M Mohammad. 2024. The Emotion Dynamics of Literary Novels. ACL (2024).","DOI":"10.18653\/v1\/2024.findings-acl.150"},{"key":"e_1_3_3_3_58_2","doi-asserted-by":"crossref","unstructured":"Jinjun Wang Changsheng Xu Engsiong Chng Hanqing Lu and Qi Tian. 2008. Automatic composition of broadcast sports video. Multimedia Systems 14 4 (2008) 179\u2013193.","DOI":"10.1007\/s00530-008-0112-6"},{"key":"e_1_3_3_3_59_2","doi-asserted-by":"crossref","unstructured":"Wenguan Wang Jianbing Shen Jianwen Xie Ming-Ming Cheng Haibin Ling and Ali Borji. 2019. Revisiting video saliency prediction in the deep learning era. TPAMI 43 1 (2019) 220\u2013237.","DOI":"10.1109\/TPAMI.2019.2924417"},{"key":"e_1_3_3_3_60_2","doi-asserted-by":"publisher","DOI":"10.1145\/1833349.1778827"},{"key":"e_1_3_3_3_61_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01267-0_19"},{"key":"e_1_3_3_3_62_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00623"},{"key":"e_1_3_3_3_63_2","doi-asserted-by":"crossref","unstructured":"Cha Zhang Yong Rui Jim Crawford and Li-Wei He. 2008. An automated end-to-end lecture capture and broadcasting system. ACM Transactions on multimedia computing communications and applications (TOMM) 4 1 (2008) 1\u201323.","DOI":"10.1145\/1324287.1324293"},{"key":"e_1_3_3_3_64_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00716"},{"key":"e_1_3_3_3_65_2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.findings-acl.248"}],"event":{"name":"IUI '25: 30th International Conference on Intelligent User Interfaces","location":"Cagliari Italy","acronym":"IUI '25","sponsor":["SIGAI ACM Special Interest Group on Artificial Intelligence","SIGCHI ACM Special Interest Group on Computer-Human Interaction"]},"container-title":["Proceedings of the 30th International Conference on Intelligent User Interfaces"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3708359.3712113","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3708359.3712113","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T01:57:06Z","timestamp":1750298226000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3708359.3712113"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,3,24]]},"references-count":64,"alternative-id":["10.1145\/3708359.3712113","10.1145\/3708359"],"URL":"https:\/\/doi.org\/10.1145\/3708359.3712113","relation":{},"subject":[],"published":{"date-parts":[[2025,3,24]]},"assertion":[{"value":"2025-03-24","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}