{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,21]],"date-time":"2026-01-21T22:47:31Z","timestamp":1769035651180,"version":"3.49.0"},"reference-count":16,"publisher":"IEEE","license":[{"start":{"date-parts":[[2025,10,22]],"date-time":"2025-10-22T00:00:00Z","timestamp":1761091200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,10,22]],"date-time":"2025-10-22T00:00:00Z","timestamp":1761091200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"DOI":"10.13039\/501100005416","name":"Research Council of Norway","doi-asserted-by":"publisher","award":["354154"],"award-info":[{"award-number":["354154"]}],"id":[{"id":"10.13039\/501100005416","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025,10,22]]},"DOI":"10.1109\/cbmi66578.2025.11339302","type":"proceedings-article","created":{"date-parts":[[2026,1,20]],"date-time":"2026-01-20T20:38:56Z","timestamp":1768941536000},"page":"1-5","source":"Crossref","is-referenced-by-count":0,"title":["VoiceVision: AI-Powered Speaker-Aware Cropping and Content Indexing for Multi-Speaker Videos"],"prefix":"10.1109","author":[{"given":"Mehdi Houshmand","family":"Sarkhoosh","sequence":"first","affiliation":[{"name":"OsloMet &#x0026; Forzasys,Oslo,Norway"}]},{"given":"Cise","family":"Midoglu","sequence":"additional","affiliation":[{"name":"Forzasys,Oslo,Norway"}]},{"given":"Saeed S.","family":"Sabet","sequence":"additional","affiliation":[{"name":"Forzasys,Oslo,Norway"}]},{"given":"Tomas","family":"Kupka","sequence":"additional","affiliation":[{"name":"Forzasys,Oslo,Norway"}]},{"given":"P\u00e5l","family":"Halvorsen","sequence":"additional","affiliation":[{"name":"SimulaMet, OsloMet &#x0026; Forzasys,Oslo,Norway"}]}],"member":"263","reference":[{"key":"ref1","author":"Sarkhoosh","year":"2023","journal-title":"Soccer on social media"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1109\/ISM59092.2023.00009"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-53302-0_22"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1109\/ACCESS.2024.3449152"},{"key":"ref5","doi-asserted-by":"crossref","DOI":"10.1109\/ICASSPW62465.2024.10626914","article-title":"Late audio-visual fusion for in-the-wild speaker diarization","volume-title":"ICASSP 2024 Workshop","author":"Pan"},{"key":"ref6","article-title":"Misp 2022 challenge: Audio-visual speaker diarization in realistic environments","volume-title":"ICASSP 2023","author":"Organizers","year":"2023"},{"key":"ref7","article-title":"Ego4d audio-visual diarization challenge 2023","volume-title":"CVPR 2023","author":"Consortium","year":"2023"},{"key":"ref8","article-title":"As-net: Active speaker detection using deep audio-visual attention","volume-title":"Multimedia Tools and Applications","author":"Gil Gomez","year":"2024"},{"key":"ref9","article-title":"Talknet: Efficient audio-visual speaker diarization","author":"Research","year":"2022","journal-title":"Open Source Release"},{"key":"ref10","volume-title":"S3FD: Single shot scale-invariant face detector","author":"Zhang","year":"2017"},{"key":"ref11","article-title":"Autoflip: An open source framework for intelligent video reframing","author":"Research","year":"2020","journal-title":"Google"},{"key":"ref12","article-title":"Zoom smart gallery: Creating equitable zoom room experiences","author":"Communications","year":"2022","journal-title":"Zoom Blog"},{"key":"ref13","article-title":"Auto reframe in adobe premiere pro","author":"Systems","year":"2022","journal-title":"Adobe HelpX"},{"key":"ref14","article-title":"Efficient multi-speaker transcription with speaker ids using whisperx","author":"Contributors","year":"2023","journal-title":"Toolify AI"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3548027"},{"key":"ref16","volume-title":"Quality-aware end-to-end audio-visual neural speaker diarization","author":"He","year":"2024"}],"event":{"name":"2025 International Conference on Content-Based Multimedia Indexing (CBMI)","location":"Dublin, Ireland","start":{"date-parts":[[2025,10,22]]},"end":{"date-parts":[[2025,10,24]]}},"container-title":["2025 International Conference on Content-Based Multimedia Indexing (CBMI)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/11339229\/11339242\/11339302.pdf?arnumber=11339302","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,1,21]],"date-time":"2026-01-21T07:21:37Z","timestamp":1768980097000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11339302\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,22]]},"references-count":16,"URL":"https:\/\/doi.org\/10.1109\/cbmi66578.2025.11339302","relation":{},"subject":[],"published":{"date-parts":[[2025,10,22]]}}}