{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,7]],"date-time":"2026-03-07T17:43:05Z","timestamp":1772905385361,"version":"3.50.1"},"reference-count":129,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","issue":"5","license":[{"start":{"date-parts":[[2021,5,1]],"date-time":"2021-05-01T00:00:00Z","timestamp":1619827200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2021,5,1]],"date-time":"2021-05-01T00:00:00Z","timestamp":1619827200000},"content-version":"am","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2021,5,1]],"date-time":"2021-05-01T00:00:00Z","timestamp":1619827200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2021,5,1]],"date-time":"2021-05-01T00:00:00Z","timestamp":1619827200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"DOI":"10.13039\/100006785","name":"research awards from Google","doi-asserted-by":"publisher","id":[{"id":"10.13039\/100006785","id-type":"DOI","asserted-by":"publisher"}]},{"name":"U.S. Chamber of Commerce Foundation"},{"DOI":"10.13039\/100000001","name":"National Science Foundation","doi-asserted-by":"publisher","id":[{"id":"10.13039\/100000001","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["Proc. IEEE"],"published-print":{"date-parts":[[2021,5]]},"DOI":"10.1109\/jproc.2020.3047978","type":"journal-article","created":{"date-parts":[[2021,1,13]],"date-time":"2021-01-13T01:13:22Z","timestamp":1610500402000},"page":"891-910","source":"Crossref","is-referenced-by-count":36,"title":["Computational Media Intelligence: Human-Centered Machine Analysis of Media"],"prefix":"10.1109","volume":"109","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-2845-1079","authenticated-orcid":false,"given":"Krishna","family":"Somandepalli","sequence":"first","affiliation":[{"name":"Department of Electrical Engineering, University of Southern California, Los Angeles, CA, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2167-4891","authenticated-orcid":false,"given":"Tanaya","family":"Guha","sequence":"additional","affiliation":[{"name":"Department of Computer Science, University of Warwick, Coventry, U.K."}]},{"given":"Victor R.","family":"Martinez","sequence":"additional","affiliation":[{"name":"Department of Computer Science, University of Southern California, Los Angeles, CA, USA"}]},{"given":"Naveen","family":"Kumar","sequence":"additional","affiliation":[{"name":"Disney Research, Glendale, CA, USA"}]},{"given":"Hartwig","family":"Adam","sequence":"additional","affiliation":[{"name":"Google Inc., Mountain View, CA, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1052-6204","authenticated-orcid":false,"given":"Shrikanth","family":"Narayanan","sequence":"additional","affiliation":[{"name":"Department of Electrical Engineering, University of Southern California, Los Angeles, CA, USA"}]}],"member":"263","reference":[{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1109\/CBMI.2015.7153604"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1145\/3242969.3243026"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1109\/ICIP.2019.8803248"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8682532"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1109\/CBMI.2019.8877398"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1007\/s11042-020-09449-6"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.123"},{"key":"ref36","article-title":"Affect extraction using aural, visual and linguistic features from multimedia documents","author":"malandrakis","year":"2012"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1109\/MSP.2006.1621452"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2011.5946961"},{"key":"ref28","first-page":"129","article-title":"A theoretical framework to represent narrative structures for visual storytelling","author":"akleman","year":"2015","journal-title":"Proc Bridges Math Music Art Archit Culture"},{"key":"ref27","year":"2020","journal-title":"Star Wars&#x2014;Wikipedia the Free Encyclopedia"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2013.2270402"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW.2015.7301352"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1145\/2818346.2820778"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2018-1462"},{"key":"ref24","author":"bordwell","year":"2004","journal-title":"Film Art An Introduction"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1109\/MMUL.2006.63"},{"key":"ref101","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2018-2028"},{"key":"ref26","first-page":"721","article-title":"VGGSound: A large-scale audio-visual dataset","author":"vedaldi","year":"2020","journal-title":"Proc Int Conf Acoust Speech Signal Process"},{"key":"ref100","article-title":"Moviescope: Large-scale analysis of movies using multiple modalities","author":"cascante-bonilla","year":"2019","journal-title":"arXiv 1908 03180"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1145\/3355390"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.1080\/00913367.2013.857620"},{"key":"ref51","doi-asserted-by":"publisher","DOI":"10.1109\/FG.2018.00020"},{"key":"ref59","article-title":"Multi-face: Self-supervised multiview adaptation for robust face clustering in videos","author":"somandepalli","year":"2020","journal-title":"arXiv 2008 11289"},{"key":"ref58","doi-asserted-by":"publisher","DOI":"10.3389\/fpsyg.2020.00208"},{"key":"ref57","article-title":"FairFace: Face attribute dataset for balanced race, gender, and age","author":"k\u00e4rkk\u00e4inen","year":"2019","journal-title":"arXiv 1908 04913"},{"key":"ref56","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2018-1929"},{"key":"ref55","doi-asserted-by":"publisher","DOI":"10.1016\/j.csl.2019.101027"},{"key":"ref54","first-page":"1","article-title":"The &#x2018;Celeb&#x2019; series: A close analysis of audio-visual elements in 2008 US presidential campaign ads","volume":"4","author":"li","year":"2017","journal-title":"Undergraduate J Humanistic Stud"},{"key":"ref53","doi-asserted-by":"publisher","DOI":"10.1109\/NCVPRIPG.2013.6776225"},{"key":"ref52","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46487-9_6"},{"key":"ref40","article-title":"Automatic prediction of emotions induced by movies","author":"baveye","year":"2015"},{"key":"ref4","author":"hale","year":"2019","journal-title":"More Than 500 Hours of Content are Now Being Uploaded to Youtube Every Minute"},{"key":"ref3","author":"clement","year":"2020","journal-title":"Most Used Social Media Platform"},{"key":"ref6","article-title":"Large scale holistic video understanding","author":"diba","year":"2019","journal-title":"arXiv 1904 11451"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2017.2723009"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1207\/S15327825MCS0403_01"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.1177\/1745691620927666"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P17-1153"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1016\/j.pragma.2018.09.007"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.1109\/WACV.2018.00058"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2016.7472183"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.1542\/peds.2008-1465"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.1001\/archpedi.160.4.348"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2017.7952682"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.2753\/JOA0091-3367370207"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.1145\/2988257.2988259"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2016.7472192"},{"key":"ref127","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2013.462"},{"key":"ref126","first-page":"10","article-title":"Pororoqa: Cartoon video series dataset for story understanding","volume":"15","author":"kim","year":"2016","journal-title":"Proc NIPS Workshop Large Scale Comput Vis Syst"},{"key":"ref125","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46454-1_17"},{"key":"ref124","doi-asserted-by":"publisher","DOI":"10.1007\/s13218-017-0505-9"},{"key":"ref73","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9053900"},{"key":"ref72","article-title":"Crossmodal learning for audio-visual speech event localization","author":"sharma","year":"2020","journal-title":"arXiv 2003 04358"},{"key":"ref129","first-page":"2658","article-title":"Knock! knock! Who is it?&#x2019; Probabilistic person identification in TV-series","author":"tapaswi","year":"2012","journal-title":"Proc IEEE Conf Comput Vis Pattern Recognit"},{"key":"ref71","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.319"},{"key":"ref128","first-page":"1102","article-title":"The repere corpus: A multimodal corpus for person recognition","author":"giraudel","year":"2012","journal-title":"Proc LREC"},{"key":"ref70","first-page":"187","article-title":"Visual voice activity detection at different speeds","author":"joosten","year":"2013","journal-title":"Proc AVSP"},{"key":"ref76","doi-asserted-by":"publisher","DOI":"10.1007\/s11042-020-09449-6"},{"key":"ref77","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v33i01.3301671"},{"key":"ref74","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2015.7178374"},{"key":"ref75","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2008.2009684"},{"key":"ref78","doi-asserted-by":"publisher","DOI":"10.1145\/3343031.3350867"},{"key":"ref79","doi-asserted-by":"publisher","DOI":"10.1109\/ACCESS.2019.2924733"},{"key":"ref60","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8682532"},{"key":"ref62","author":"staff","year":"2019","journal-title":"Improvement Toward Inclusion in Film But More Work to be Done"},{"key":"ref61","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2017-1386"},{"key":"ref63","author":"ryzik","year":"0","journal-title":"How Long Is An Actress Onscreen? A New Tool Finds the Answer Faster"},{"key":"ref64","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2016-540"},{"key":"ref65","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2017.7952261"},{"key":"ref66","doi-asserted-by":"publisher","DOI":"10.1109\/ICIP.2019.8803248"},{"key":"ref67","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2008.4587756"},{"key":"ref68","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58548-8_41"},{"key":"ref2","author":"simpson","year":"2017","journal-title":"Council Post Finding Brand Success in the Digital World"},{"key":"ref69","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2018.2798607"},{"key":"ref1","author":"watson","year":"2020","journal-title":"Topic Movie Industry"},{"key":"ref109","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.447"},{"key":"ref95","article-title":"Condensed movies: Story based retrieval with contextual embeddings","author":"bain","year":"2020","journal-title":"arXiv 2005 04208"},{"key":"ref108","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-24947-6_17"},{"key":"ref94","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00633"},{"key":"ref107","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298940"},{"key":"ref93","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2017.2745712"},{"key":"ref106","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.778"},{"key":"ref92","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8682314"},{"key":"ref105","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01261-8_26"},{"key":"ref91","year":"2018","journal-title":"The Geena Davis Institute on Gender in Media and J Walter Thompson Present Revealing Findings About Women&#x2019;s Representation in Advertising at Cannes Lions"},{"key":"ref104","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00236"},{"key":"ref90","doi-asserted-by":"publisher","DOI":"10.1093\/acprof:oso\/9780195372076.001.0001"},{"key":"ref103","article-title":"From trailers to storylines: An efficient way to learn from movies","author":"huang","year":"2018","journal-title":"arXiv 1806 05341"},{"key":"ref102","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00895"},{"key":"ref111","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46454-1_27"},{"key":"ref112","doi-asserted-by":"publisher","DOI":"10.1145\/1873951.1874068"},{"key":"ref110","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-016-0987-1"},{"key":"ref98","first-page":"17","article-title":"A unified framework for shot type classification based on subject centric lens","author":"rao","year":"2020","journal-title":"Proc Eur Conf Comput Vis (ECCV)"},{"key":"ref99","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00469"},{"key":"ref96","first-page":"10146","article-title":"A local-to-global approach to multi-modal movie scene segmentation","author":"rao","year":"2020","journal-title":"Proc IEEE\/CVF Conf Comput Vis Pattern Recognit (CVPR)"},{"key":"ref97","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58520-4_9"},{"key":"ref10","first-page":"11","article-title":"Theoretical foundations of identity","author":"hammack","year":"2015","journal-title":"The Oxford Handbook of Identity Development"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1348\/014466607X187037"},{"key":"ref12","doi-asserted-by":"crossref","first-page":"3","DOI":"10.1207\/s1532706xid0601_2","article-title":"Identity studies: How close are we to developing a social science of identity?&#x2014;An appraisal of the field","volume":"6","author":"c\u00f4t\u00e9","year":"2006","journal-title":"Identities"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1016\/j.sbspro.2015.02.244"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1177\/0146167293191001"},{"key":"ref15","article-title":"Personal dimensions of identity model","author":"arredondo","year":"1992","journal-title":"Proc Boston Empowerment Workshops"},{"key":"ref118","article-title":"DramaQA: Character-centered video story understanding with hierarchical QA","author":"choi","year":"2020","journal-title":"arXiv 2005 03356"},{"key":"ref16","first-page":"572","article-title":"SEXNET: A neural network identifies sex from human faces","author":"golomb","year":"1991","journal-title":"Advances in Neural Information Processing Systems 3"},{"key":"ref82","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D17-1247"},{"key":"ref117","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2013.6637694"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2007.70800"},{"key":"ref81","article-title":"Victim or perpetrator? Analysis of violent characters portrayals from movie scripts","author":"martinez","year":"2020","journal-title":"arXiv 2008 08225"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2014.2321570"},{"key":"ref84","doi-asserted-by":"publisher","DOI":"10.1145\/3287560.3287589"},{"key":"ref119","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.01024"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1006\/cviu.1997.0549"},{"key":"ref83","article-title":"A survey on bias and fairness in machine learning","author":"mehrabi","year":"2019","journal-title":"arXiv 1908 09635"},{"key":"ref114","doi-asserted-by":"publisher","DOI":"10.1016\/j.asoc.2017.08.029"},{"key":"ref113","doi-asserted-by":"publisher","DOI":"10.1109\/IJCNN.2016.7727207"},{"key":"ref116","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2013.283"},{"key":"ref80","first-page":"317","article-title":"The MediaEval 2011 affect task: Violent scenes detection in hollywood movies","author":"demarty","year":"2011","journal-title":"Proc Mediaeval Workshop"},{"key":"ref115","doi-asserted-by":"publisher","DOI":"10.1145\/2671188.2749296"},{"key":"ref120","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D18-1167"},{"key":"ref89","year":"2020","journal-title":"Diversity and Inclusivity Report Gender in Youtube Advertising&#x2014;Think With Google"},{"key":"ref121","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.acl-main.730"},{"key":"ref122","article-title":"MELD: A multimodal multi-party dataset for emotion recognition in conversations","author":"poria","year":"2018","journal-title":"arXiv 1810 02508"},{"key":"ref123","doi-asserted-by":"publisher","DOI":"10.5244\/C.31.107"},{"key":"ref85","doi-asserted-by":"publisher","DOI":"10.1145\/3287560.3287598"},{"key":"ref86","doi-asserted-by":"publisher","DOI":"10.1145\/3359246"},{"key":"ref87","article-title":"Measuring and reducing gendered correlations in pre-trained models","author":"webster","year":"2020","journal-title":"arXiv 2010 06032"},{"key":"ref88","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/N18-1202"}],"container-title":["Proceedings of the IEEE"],"original-title":[],"link":[{"URL":"https:\/\/ieeexplore.ieee.org\/ielam\/5\/9420072\/9319168-aam.pdf","content-type":"application\/pdf","content-version":"am","intended-application":"syndication"},{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/5\/9420072\/09319168.pdf?arnumber=9319168","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,6,16]],"date-time":"2022-06-16T19:32:09Z","timestamp":1655407929000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/9319168\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2021,5]]},"references-count":129,"journal-issue":{"issue":"5"},"URL":"https:\/\/doi.org\/10.1109\/jproc.2020.3047978","relation":{},"ISSN":["0018-9219","1558-2256"],"issn-type":[{"value":"0018-9219","type":"print"},{"value":"1558-2256","type":"electronic"}],"subject":[],"published":{"date-parts":[[2021,5]]}}}