{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,7,5]],"date-time":"2025-07-05T04:02:32Z","timestamp":1751688152537,"version":"3.41.0"},"publisher-location":"New York, NY, USA","reference-count":66,"publisher":"ACM","license":[{"start":{"date-parts":[[2025,4,25]],"date-time":"2025-04-25T00:00:00Z","timestamp":1745539200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by-nc\/4.0\/"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,4,26]]},"DOI":"10.1145\/3706598.3713857","type":"proceedings-article","created":{"date-parts":[[2025,4,24]],"date-time":"2025-04-24T03:17:03Z","timestamp":1745464623000},"page":"1-17","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["VidSTR: Automatic Spatiotemporal Retargeting of Speech-Driven Video Compositions"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-0229-0836","authenticated-orcid":false,"given":"Joshua Kong","family":"Yang","sequence":"first","affiliation":[{"name":"Brown University, Providence, Rhode Island, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8070-4918","authenticated-orcid":false,"given":"Mackenzie","family":"Leake","sequence":"additional","affiliation":[{"name":"Adobe Research, San Francisco, California, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3453-5666","authenticated-orcid":false,"given":"Jeff","family":"Huang","sequence":"additional","affiliation":[{"name":"Brown University, Providence, Rhode Island, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6694-3381","authenticated-orcid":false,"given":"Stephen","family":"DiVerdi","sequence":"additional","affiliation":[{"name":"Adobe Research, San Francisco, California, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2025,4,25]]},"reference":[{"volume-title":"Speechmatics","year":"2024","key":"e_1_3_3_2_2_2","unstructured":"2024. Speechmatics. https:\/\/www.speechmatics.com\/ Accessed: 2024-08."},{"key":"e_1_3_3_2_3_2","doi-asserted-by":"crossref","unstructured":"Ido Arev Hyun\u00a0Soo Park Yaser Sheikh Jessica Hodgins and Ariel Shamir. 2014. Automatic editing of footage from multiple social cameras. ACM Transactions on Graphics (TOG) 33 4 (2014) 1\u201311.","DOI":"10.1145\/2601097.2601198"},{"key":"e_1_3_3_2_4_2","volume-title":"Grammar of the film language","author":"Arijon Daniel","year":"1976","unstructured":"Daniel Arijon. 1976. Grammar of the film language. Silman James Press."},{"key":"e_1_3_3_2_5_2","volume-title":"Art and visual perception: A psychology of the creative eye","author":"Arnheim Rudolf","year":"1954","unstructured":"Rudolf Arnheim. 1954. Art and visual perception: A psychology of the creative eye. Univ of California Press."},{"key":"e_1_3_3_2_6_2","doi-asserted-by":"publisher","DOI":"10.1145\/3596711.3596776"},{"key":"e_1_3_3_2_7_2","doi-asserted-by":"publisher","unstructured":"Greg\u00a0J. Badros Alan Borning and Peter\u00a0J. Stuckey. 2001. The Cassowary linear arithmetic constraint solving algorithm. ACM Trans. Comput.-Hum. Interact. 8 4 (dec 2001) 267\u2013306. 10.1145\/504704.504705","DOI":"10.1145\/504704.504705"},{"key":"e_1_3_3_2_8_2","doi-asserted-by":"publisher","DOI":"10.5555\/548834"},{"key":"e_1_3_3_2_9_2","volume-title":"Creative Motion Graphic Titling for Film, Video, and the Web","author":"Braha Y.","year":"2011","unstructured":"Y. Braha and B. Byrne. 2011. Creative Motion Graphic Titling for Film, Video, and the Web. Focal Press. https:\/\/books.google.com\/books?id=cP398FPOy60C"},{"key":"e_1_3_3_2_10_2","doi-asserted-by":"crossref","unstructured":"Stuart\u00a0K Card Jock\u00a0D Mackinlay and George\u00a0G Robertson. 1991. A morphological analysis of the design space of input devices. ACM Transactions on Information Systems (TOIS) 9 2 (1991) 99\u2013122.","DOI":"10.1145\/123078.128726"},{"key":"e_1_3_3_2_11_2","doi-asserted-by":"publisher","DOI":"10.1145\/3526113.3545676"},{"key":"e_1_3_3_2_12_2","doi-asserted-by":"publisher","DOI":"10.1145\/3472749.3474778"},{"key":"e_1_3_3_2_13_2","doi-asserted-by":"publisher","DOI":"10.1145\/2501988.2502052"},{"key":"e_1_3_3_2_14_2","doi-asserted-by":"publisher","DOI":"10.1145\/3379337.3415814"},{"key":"e_1_3_3_2_15_2","doi-asserted-by":"publisher","DOI":"10.1145\/3397481.3450652"},{"key":"e_1_3_3_2_16_2","doi-asserted-by":"publisher","DOI":"10.1145\/3313831.3376553"},{"key":"e_1_3_3_2_17_2","unstructured":"FX Elements. 2024. Getting Started with Video Overlays. https:\/\/www.fxelements.com\/guides\/getting-started-with-video-overlays Accessed: 2024-12."},{"key":"e_1_3_3_2_18_2","first-page":"6","volume-title":"BMVC","author":"Everingham Mark","year":"2006","unstructured":"Mark Everingham, Josef Sivic, and Andrew Zisserman. 2006. \u201cHello! My name is... Buffy\u201d\u2013Automatic Naming of Characters in TV Video.. In BMVC , Vol.\u00a02. Citeseer, 6."},{"key":"e_1_3_3_2_19_2","unstructured":"Marshal Carper. Social\u00a0Media Examiner.2019. A 6-Step Workflow to Create Video for Multiple Platforms. https:\/\/www.socialmediaexaminer.com\/6-step-workflow-create-video-multiple-platforms\/ Accessed: 2024-12."},{"key":"e_1_3_3_2_20_2","doi-asserted-by":"publisher","DOI":"10.1109\/ISMAR.2014.6948429"},{"volume-title":"MediaPipe","year":"2024","key":"e_1_3_3_2_21_2","unstructured":"Google. 2024. MediaPipe. https:\/\/github.com\/google-ai-edge\/mediapipe Accessed: 2024-09."},{"key":"e_1_3_3_2_22_2","doi-asserted-by":"publisher","DOI":"10.1109\/CISP.2008.714"},{"key":"e_1_3_3_2_23_2","doi-asserted-by":"crossref","unstructured":"Stephen\u00a0M Hart and Liu Yi-Hsin. 1995. The application of integer linear programming to the implementation of a graphical user interface: a new rectangular packing problem. Applied mathematical modelling 19 4 (1995) 244\u2013254.","DOI":"10.1016\/0307-904X(94)00033-3"},{"key":"e_1_3_3_2_24_2","volume-title":"HiGHS: High-performance parallel linear optimization software","author":"Huangfu Qi","year":"2024","unstructured":"Qi Huangfu, Lukas Schork, Michael Feldmeier, Leona Gottwald, Julian Hall, and Ivet Galabova. 2024. HiGHS: High-performance parallel linear optimization software. University of Edinburgh. https:\/\/highs.dev Version 1.5.3."},{"key":"e_1_3_3_2_25_2","doi-asserted-by":"publisher","DOI":"10.1145\/3290605.3300311"},{"key":"e_1_3_3_2_26_2","doi-asserted-by":"publisher","unstructured":"Eakta Jain Yaser Sheikh Ariel Shamir and Jessica Hodgins. 2015. Gaze-Driven Video Re-Editing. ACM Trans. Graph. 34 2 Article 21 (mar 2015) 12\u00a0pages. 10.1145\/2699644","DOI":"10.1145\/2699644"},{"key":"e_1_3_3_2_27_2","doi-asserted-by":"publisher","DOI":"10.1145\/3290605.3300643"},{"key":"e_1_3_3_2_28_2","doi-asserted-by":"publisher","unstructured":"Topi Kaaresoja Stephen Brewster and Vuokko Lantz. 2014. Towards the Temporally Perfect Virtual Button: Touch-Feedback Simultaneity and Perceived Quality in Mobile Touchscreen Press Interactions. ACM Trans. Appl. Percept. 11 2 Article 9 (jun 2014) 25\u00a0pages. 10.1145\/2611387","DOI":"10.1145\/2611387"},{"key":"e_1_3_3_2_29_2","volume-title":"Film Directing Shot by Shot: Visualizing from Concept to Screen","author":"Katz Steven\u00a0D","year":"1991","unstructured":"Steven\u00a0D Katz. 1991. Film Directing Shot by Shot: Visualizing from Concept to Screen. Michael Wiese Productions."},{"key":"e_1_3_3_2_30_2","doi-asserted-by":"publisher","unstructured":"Johannes Kiess Benjamin Guthier Stephan Kopf and Wolfgang Effelsberg. 2012. SeamCrop: changing the size and aspect ratio of videos(MoVid \u201912). Association for Computing Machinery New York NY USA 13\u201318. 10.1145\/2151677.2151681","DOI":"10.1145\/2151677.2151681"},{"key":"e_1_3_3_2_31_2","doi-asserted-by":"publisher","unstructured":"Jeongyeon Kim Yubin Choi Minsuk Kahng and Juho Kim. 2022. FitVid: Responsive and Flexible Video Content Adaptation(CHI \u201922). Association for Computing Machinery New York NY USA Article 501 16\u00a0pages. 10.1145\/3491102.3501948","DOI":"10.1145\/3491102.3501948"},{"key":"e_1_3_3_2_32_2","doi-asserted-by":"publisher","DOI":"10.1145\/2702123.2702507"},{"key":"e_1_3_3_2_33_2","first-page":"957","volume-title":"International conference on machine learning","author":"Kusner Matt","year":"2015","unstructured":"Matt Kusner, Yu Sun, Nicholas Kolkin, and Kilian Weinberger. 2015. From word embeddings to document distances. In International conference on machine learning. PMLR, 957\u2013966."},{"key":"e_1_3_3_2_34_2","doi-asserted-by":"publisher","unstructured":"Mackenzie Leake Abe Davis Anh Truong and Maneesh Agrawala. 2017. Computational video editing for dialogue-driven scenes. ACM Trans. Graph. 36 4 Article 130 (jul 2017) 14\u00a0pages. 10.1145\/3072959.3073653","DOI":"10.1145\/3072959.3073653"},{"key":"e_1_3_3_2_35_2","doi-asserted-by":"publisher","DOI":"10.1145\/3613904.3642667"},{"key":"e_1_3_3_2_36_2","doi-asserted-by":"crossref","unstructured":"Mackenzie Leake Hijung\u00a0Valentina Shin Joy\u00a0O Kim and Maneesh Agrawala. 2020. Generating Audio-Visual Slideshows from Text Articles Using Word Concreteness. Proceedings of the 2020 CHI Conference on Human Factors in Computing Systems (2020) 1\u201311.","DOI":"10.1145\/3313831.3376519"},{"key":"e_1_3_3_2_37_2","unstructured":"Vladimir Likic. 2008. The Needleman-Wunsch algorithm for sequence alignment. Lecture given at the 7th Melbourne Bioinformatics Course Bi021 Molecular Science and Biotechnology Institute University of Melbourne (2008) 1\u201346."},{"key":"e_1_3_3_2_38_2","doi-asserted-by":"publisher","DOI":"10.1145\/3544548.3581566"},{"key":"e_1_3_3_2_39_2","doi-asserted-by":"crossref","unstructured":"Meinard M\u00fcller. 2007. Dynamic time warping. Information retrieval for music and motion (2007) 69\u201384.","DOI":"10.1007\/978-3-540-74048-3_4"},{"key":"e_1_3_3_2_40_2","doi-asserted-by":"publisher","DOI":"10.5555\/2821575"},{"key":"e_1_3_3_2_41_2","doi-asserted-by":"publisher","DOI":"10.1145\/3544548.3580873"},{"volume-title":"GPT-4","year":"2023","key":"e_1_3_3_2_42_2","unstructured":"OpenAI. 2023. GPT-4. https:\/\/openai.com\/gpt-4 Accessed: 2024-08."},{"key":"e_1_3_3_2_43_2","doi-asserted-by":"publisher","DOI":"10.1145\/2807442.2807502"},{"key":"e_1_3_3_2_44_2","doi-asserted-by":"publisher","DOI":"10.1145\/2642918.2647400"},{"key":"e_1_3_3_2_45_2","unstructured":"Steven Piantadosi. 2023. Modern language models refute Chomsky\u2019s approach to language. Lingbuzz Preprint lingbuzz 7180 (2023)."},{"key":"e_1_3_3_2_46_2","doi-asserted-by":"publisher","DOI":"10.1145\/2501988.2501993"},{"key":"e_1_3_3_2_47_2","doi-asserted-by":"publisher","DOI":"10.1145\/3290605.3300852"},{"key":"e_1_3_3_2_48_2","unstructured":"Pavel Senin. 2008. Dynamic time warping algorithm review. Information and Computer Science Department University of Hawaii at Manoa Honolulu USA 855 1-23 (2008) 40."},{"key":"e_1_3_3_2_49_2","doi-asserted-by":"publisher","DOI":"10.1145\/3505284.3529970"},{"key":"e_1_3_3_2_50_2","doi-asserted-by":"crossref","unstructured":"Takehide Soh Katsumi Inoue Naoyuki Tamura Mutsunori Banbara and Hidetomo Nabeshima. 2010. A SAT-based method for solving the two-dimensional strip packing problem. Fundamenta Informaticae 102 3-4 (2010) 467\u2013487.","DOI":"10.3233\/FI-2010-314"},{"key":"e_1_3_3_2_51_2","doi-asserted-by":"crossref","unstructured":"Ralf Steinmetz. 1996. Human perception of jitter and media synchronization. IEEE Journal on selected Areas in Communications 14 1 (1996) 61\u201372.","DOI":"10.1109\/49.481694"},{"key":"e_1_3_3_2_52_2","doi-asserted-by":"publisher","DOI":"10.1145\/3313831.3376593"},{"key":"e_1_3_3_2_53_2","volume-title":"Grammar of the Edit","author":"Thompson Roy","year":"2009","unstructured":"Roy Thompson and Christopher\u00a0J Bowen. 2009. Grammar of the Edit. Taylor & Francis."},{"key":"e_1_3_3_2_54_2","doi-asserted-by":"publisher","DOI":"10.1145\/3640543.3645164"},{"key":"e_1_3_3_2_55_2","first-page":"14","volume-title":"Graphics Interface","author":"Truong Anh","year":"2019","unstructured":"Anh Truong and Maneesh Agrawala. 2019. A Tool for Navigating and Editing 360 Video of Social Conversations into Shareable Highlights.. In Graphics Interface. 14\u20131."},{"key":"e_1_3_3_2_56_2","doi-asserted-by":"publisher","DOI":"10.1145\/2984511.2984569"},{"key":"e_1_3_3_2_57_2","doi-asserted-by":"publisher","DOI":"10.1145\/3411764.3445721"},{"key":"e_1_3_3_2_58_2","volume-title":"Tolerable round-trip time delay for sound-programme and television broadcast programme inserts","author":"Union International\u00a0Telecommunication","year":"2004","unstructured":"International\u00a0Telecommunication Union. 2004. Tolerable round-trip time delay for sound-programme and television broadcast programme inserts. Technical Report. United Nations Economic and Social Council."},{"key":"e_1_3_3_2_59_2","doi-asserted-by":"publisher","DOI":"10.1145\/3229434.3229465"},{"key":"e_1_3_3_2_60_2","doi-asserted-by":"publisher","DOI":"10.1145\/3640543.3645143"},{"key":"e_1_3_3_2_61_2","doi-asserted-by":"publisher","unstructured":"Oliver Wang Christopher Schroers Henning Zimmer Markus Gross and Alexander Sorkine-Hornung. 2014. VideoSnapping: interactive synchronization of multiple videos. ACM Trans. Graph. 33 4 Article 77 (jul 2014) 10\u00a0pages. 10.1145\/2601097.2601208","DOI":"10.1145\/2601097.2601208"},{"key":"e_1_3_3_2_62_2","doi-asserted-by":"publisher","DOI":"10.1145\/3613904.3642868"},{"volume-title":"Old-Fashioned Apple Pie","year":"1964","key":"e_1_3_3_2_63_2","unstructured":"Wikipedia. 1964. Old-Fashioned Apple Pie. Public Domain Recepies."},{"volume-title":"Lemon","year":"2024","key":"e_1_3_3_2_64_2","unstructured":"Wikipedia. 2024. Lemon. Wikimedia Foundation."},{"key":"e_1_3_3_2_65_2","doi-asserted-by":"publisher","DOI":"10.1145\/3379337.3415882"},{"key":"e_1_3_3_2_66_2","doi-asserted-by":"publisher","DOI":"10.1121\/1.2935783"},{"key":"e_1_3_3_2_67_2","doi-asserted-by":"publisher","DOI":"10.1145\/2501988.2502007"}],"event":{"name":"CHI 2025: CHI Conference on Human Factors in Computing Systems","sponsor":["SIGCHI ACM Special Interest Group on Computer-Human Interaction"],"location":"Yokohama Japan","acronym":"CHI '25"},"container-title":["Proceedings of the 2025 CHI Conference on Human Factors in Computing Systems"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3706598.3713857","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3706598.3713857","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,7,4]],"date-time":"2025-07-04T05:02:50Z","timestamp":1751605370000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3706598.3713857"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,4,25]]},"references-count":66,"alternative-id":["10.1145\/3706598.3713857","10.1145\/3706598"],"URL":"https:\/\/doi.org\/10.1145\/3706598.3713857","relation":{},"subject":[],"published":{"date-parts":[[2025,4,25]]},"assertion":[{"value":"2025-04-25","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}