{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,8]],"date-time":"2026-02-08T04:21:16Z","timestamp":1770524476404,"version":"3.49.0"},"publisher-location":"New York, New York, USA","reference-count":39,"publisher":"ACM Press","license":[{"start":{"date-parts":[[2018,1,1]],"date-time":"2018-01-01T00:00:00Z","timestamp":1514764800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2018]]},"DOI":"10.1145\/3184558.3186584","type":"proceedings-article","created":{"date-parts":[[2018,4,18]],"date-time":"2018-04-18T18:04:25Z","timestamp":1524074665000},"page":"671-678","source":"Crossref","is-referenced-by-count":11,"title":["Fine-grained Video Attractiveness Prediction Using Multimodal Deep Learning on a Large Real-world Dataset"],"prefix":"10.1145","author":[{"given":"Xinpeng","family":"Chen","sequence":"first","affiliation":[{"name":"Wuhan University, Wuhan, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Jingyuan","family":"Chen","sequence":"additional","affiliation":[{"name":"National University of Singapore, Singapore, Singapore"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Lin","family":"Ma","sequence":"additional","affiliation":[{"name":"Tencent AI Lab, Shenzhen, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Jian","family":"Yao","sequence":"additional","affiliation":[{"name":"Wuhan University, Wuhan, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Wei","family":"Liu","sequence":"additional","affiliation":[{"name":"Tencent AI Lab, Shenzhen, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Jiebo","family":"Luo","sequence":"additional","affiliation":[{"name":"University of Rochester, Rochester, NY, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Tong","family":"Zhang","sequence":"additional","affiliation":[{"name":"Tencent AI Lab, Shenzhen, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","reference":[{"key":"key-10.1145\/3184558.3186584-1","unstructured":"Sami Abu-El-Haija, Nisarg Kothari, Joonseok Lee, Apostol (Paul) Natsev, George Toderici, Balakrishnan Varadarajan, and Sudheendra Vijayanarasimhan. 2016. YouTube-8M: A Large-Scale Video Classification Benchmark. In arXiv:1609.08675."},{"key":"key-10.1145\/3184558.3186584-2","doi-asserted-by":"crossref","unstructured":"Moshe Blank, Lena Gorelick, Eli Shechtman, Michal Irani, and Ronen Basri. 2005. Actions as Space-Time Shapes. In ICCV.","DOI":"10.1109\/ICCV.2005.28"},{"key":"key-10.1145\/3184558.3186584-3","doi-asserted-by":"crossref","unstructured":"Jingyuan Chen, Xuemeng Song, Liqiang Nie, Xiang Wang, Hanwang Zhang, and Tat-Seng Chua. 2016. Micro Tells Macro: Predicting the Popularity of MicroVideos via a Transductive Model. In ACM Multimedia.","DOI":"10.1145\/2964284.2964314"},{"key":"key-10.1145\/3184558.3186584-4","doi-asserted-by":"crossref","unstructured":"Jingyuan Chen, Hanwang Zhang, Xiangnan He, Liqiang Nie, Wei Liu, and TatSeng Chua. 2017. Attentive Collaborative Filtering: Multimedia Recommendation with Item and Component-Level Attention. In SIGIR.","DOI":"10.1145\/3077136.3080797"},{"key":"key-10.1145\/3184558.3186584-5","doi-asserted-by":"crossref","unstructured":"Steven Davis and Paul Mermelstein. 1980. Comparison of parametric representations for monosyllabic word recognition in continuously spoken sentences. IEEE transactions on acoustics, speech, and signal processing (1980).","DOI":"10.1109\/TASSP.1980.1163420"},{"key":"key-10.1145\/3184558.3186584-6","unstructured":"Jesse Engel, Cinjon Resnick, Adam Roberts, Sander Dieleman, Douglas Eck, Karen Simonyan, and Mohammad Norouzi. 2017. Neural Audio Synthesis of Musical Notes with WaveNet Autoencoders. arXiv preprint arXiv:1704.01279 (2017)."},{"key":"key-10.1145\/3184558.3186584-7","doi-asserted-by":"crossref","unstructured":"John N. Gowdy and Zekeriya Tufekci. 2000. Mel-scaled discrete wavelet coefficients for speech recognition. In ICASSP.","DOI":"10.1109\/ICASSP.2000.861829"},{"key":"key-10.1145\/3184558.3186584-8","unstructured":"Kaiming He, Xiangyu Zhang, Shaoqing Ren, and Jian Sun. 2015. Deep Residual Learning for Image Recognition. In CVPR."},{"key":"key-10.1145\/3184558.3186584-9","doi-asserted-by":"crossref","unstructured":"Fabian Caba Heilbron, Victor Escorcia, Bernard Ghanem, and Juan Carlos Niebles. 2015. ActivityNet: A Large-Scale Video Benchmark for Human Activity Understanding. In CVPR.","DOI":"10.1109\/CVPR.2015.7298698"},{"key":"key-10.1145\/3184558.3186584-10","unstructured":"Sergey Ioffe and Christian Szegedy. 2015. Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift. In ICML."},{"key":"key-10.1145\/3184558.3186584-11","doi-asserted-by":"crossref","unstructured":"Yugang Jiang, Qi Dai, Xiangyang Xue, Wei Liu, and Chong-Wah Ngo. 2012. Trajectory-based modeling of human actions with motion reference points. In ECCV.","DOI":"10.1007\/978-3-642-33715-4_31"},{"key":"key-10.1145\/3184558.3186584-12","unstructured":"Yugang Jiang, Jingen Liu, Amir Roshan Zamir, Ivan Laptev, Massimo Piccardi, Mubarak Shah, and Rahul Sukthankar. 2013. THUMOS Challenge: Action Recognition with a Large Number of Classes. http:\/\/crcv.ucf.edu\/ ICCV13-Action-Workshop\/. (2013)."},{"key":"key-10.1145\/3184558.3186584-13","doi-asserted-by":"crossref","unstructured":"Yugang Jiang, Yanran Wang, Rui Feng, Xiangyang Xue, Yingbin Zheng, and Hanfang Yang. 2013. Understanding and Predicting Interestingness of Videos. In AAAI.","DOI":"10.1609\/aaai.v27i1.8457"},{"key":"key-10.1145\/3184558.3186584-14","unstructured":"Yugang Jiang, Zuxuan Wu, Jun Wang, Xiangyang Xue, and Shih-Fu Chang. 2015. Exploiting Feature and Class Relationships in Video Categorization with Regularized Deep Neural Networks. arXiv preprint arXiv:1502.07209 (2015)."},{"key":"key-10.1145\/3184558.3186584-15","doi-asserted-by":"crossref","unstructured":"Andrej Karpathy, George Toderici, Sanketh Shetty, Thomas Leung, Rahul Sukthankar, and Li Fei-Fei. 2014. Large-scale Video Classification with Convolutional Neural Networks. In CVPR.","DOI":"10.1109\/CVPR.2014.223"},{"key":"key-10.1145\/3184558.3186584-16","doi-asserted-by":"crossref","unstructured":"Aditya Khosla, Atish Das Sarma, and Raffay Hamid. 2014. What makes an image popular. In WWW.","DOI":"10.1145\/2566486.2567996"},{"key":"key-10.1145\/3184558.3186584-17","doi-asserted-by":"crossref","unstructured":"H. Kuehne, H. Jhuang, E. Garrote, T. Poggio, and T. Serre. 2011. HMDB: a large video database for human motion recognition. In ICCV.","DOI":"10.1109\/ICCV.2011.6126543"},{"key":"key-10.1145\/3184558.3186584-18","unstructured":"Laptev and Ivan. 2005. On space-time interest points. International journal of computer vision 64, 2--3 (2005), 107--123."},{"key":"key-10.1145\/3184558.3186584-19","doi-asserted-by":"crossref","unstructured":"Yuncheng Li, Yale Song, Liangliang Cao, Joel Tetreault, Larry Goldberg, Alejandro Jaimes, and Jiebo Luo. 2016. TGIF: A new dataset and benchmark on animated GIF description. In CVPR.","DOI":"10.1109\/CVPR.2016.502"},{"key":"key-10.1145\/3184558.3186584-20","unstructured":"Feng Liu, Yuzhen Niu, and Michael Gleicher. 2009. Using Web Photos for Measuring Video Frame Interestingness. In IJCAI."},{"key":"key-10.1145\/3184558.3186584-21","doi-asserted-by":"crossref","unstructured":"Wei Liu, Zhifeng Li, and Xiaoou Tang. 2006. Spatio-temporal embedding for statistical face recognition from video. In ECCV.","DOI":"10.1007\/11744047_29"},{"key":"key-10.1145\/3184558.3186584-22","doi-asserted-by":"crossref","unstructured":"Marcin Marszaek, Ivan Laptev, and Cordelia Schmid. 2009. Actions in Context. In CVPR.","DOI":"10.1109\/CVPR.2009.5206557"},{"key":"key-10.1145\/3184558.3186584-23","unstructured":"K Messer, J Matas, J Kittler, J Luettin, and G Maitre. 1999. XM2VTSDB: The Extended M2VTS Database. In Second International Conference on Audio and Video-based Biometric Person Authentication."},{"key":"key-10.1145\/3184558.3186584-24","unstructured":"Antoine Miech, Ivan Laptev, and Josef Sivic. 2017. Learnable pooling with Context Gating for video classification. arXiv preprint arXiv:1706.06905 (2017)."},{"key":"key-10.1145\/3184558.3186584-25","unstructured":"Aaron van den Oord, Sander Dieleman, Heiga Zen, Karen Simonyan, Oriol Vinyals, Alex Graves, Nal Kalchbrenner, Andrew Senior, and Koray Kavukcuoglu. 2016. Wavenet: A generative model for raw audio. arXiv preprint arXiv:1609.03499 (2016)."},{"key":"key-10.1145\/3184558.3186584-26","doi-asserted-by":"crossref","unstructured":"Henrique Pinto, Jussara M. Almeida, and Marcos A. Gon&#231;alves. 2013. Using Early View Patterns to Predict the Popularity of Youtube Videos. In WSDM.","DOI":"10.1145\/2433396.2433443"},{"key":"key-10.1145\/3184558.3186584-27","unstructured":"Marian-Andrei Rizoiu, Lexing Xie, Scott Sanner, Manuel Cebrian, Honglin Yu, and Pascal Van Hentenryck. 2017. Expecting to Be HIP: Hawkes Intensity Processes for Social Media Popularity. In WWW."},{"key":"key-10.1145\/3184558.3186584-28","unstructured":"Yuesong Shen, Claire-H&#233;l&#232;ne Demarty, and Ngoc Q. K. Duong. 2016. Technicolor@MediaEval 2016 Predicting Media Interestingness Task. In MediaEval."},{"key":"key-10.1145\/3184558.3186584-29","unstructured":"Karen Simonyan and Andrew Zisserman. 2014. Very Deep Convolutional Networks for Large-Scale Image Recognition. arXiv preprint arXiv:1409.1556 (2014)."},{"key":"key-10.1145\/3184558.3186584-30","unstructured":"Khurram Soomro, Amir Roshan Zamir, and Mubarak Shah. 2012. UCF101: A Dataset of 101 Human Actions Classes From Videos in The Wild. In arXiv:1212.0402."},{"key":"key-10.1145\/3184558.3186584-31","doi-asserted-by":"crossref","unstructured":"G&#225;bor Szab&#243; and Bernardo A. Huberman. 2010. Predicting the popularity of online content. Commun. ACM 53, 8 (2010), 80--88.","DOI":"10.1145\/1787234.1787254"},{"key":"key-10.1145\/3184558.3186584-32","doi-asserted-by":"crossref","unstructured":"Christian Szegedy, Sergey Ioffe, Vincent Vanhoucke, and Alex A. Alemi. 2016. Inception-v4, Inception-ResNet and the Impact of Residual Connections on Learning. In ICLR Workshop.","DOI":"10.1609\/aaai.v31i1.11231"},{"key":"key-10.1145\/3184558.3186584-33","doi-asserted-by":"crossref","unstructured":"Christian Szegedy, Wei Liu, Yangqing Jia, Pierre Sermanet, Scott Reed, Dragomir Anguelov, Dumitru Erhan, Vincent Vanhoucke, and Andrew Rabinovich. 2015. Going Deeper with Convolutions. In CVPR.","DOI":"10.1109\/CVPR.2015.7298594"},{"key":"key-10.1145\/3184558.3186584-34","doi-asserted-by":"crossref","unstructured":"Christian Szegedy, Vincent Vanhoucke, Sergey Ioffe, Jonathon Shlens, and Zbigniew Wojna. 2016. Rethinking the Inception Architecture for Computer Vision. In CVPR.","DOI":"10.1109\/CVPR.2016.308"},{"key":"key-10.1145\/3184558.3186584-35","doi-asserted-by":"crossref","unstructured":"Bart Thomee, David A. Shamma, Gerald Friedland, Benjamin Elizalde, Karl Ni, Douglas Poland, Damian Borth, and Li-Jia Li. 2016. YFCC100M: The New Data in Multimedia Research. Commun. ACM 59, 2 (Jan. 2016), 64--73.","DOI":"10.1145\/2812802"},{"key":"key-10.1145\/3184558.3186584-36","doi-asserted-by":"crossref","unstructured":"George Tzanetakis and Perry Cook. 2002. Musical genre classification of audio signals. IEEE Transactions on speech and audio processing 10, 5 (2002), 293--302.","DOI":"10.1109\/TSA.2002.800560"},{"key":"key-10.1145\/3184558.3186584-37","doi-asserted-by":"crossref","unstructured":"Bin Wu, Erheng Zhong, Ben Tan, Andrew Horner, and Qiang Yang. 2014. Crowdsourced time-sync video tagging using temporal and personalized topic modeling. In SIGKDD.","DOI":"10.1145\/2623330.2623625"},{"key":"key-10.1145\/3184558.3186584-38","doi-asserted-by":"crossref","unstructured":"Sejong Yoon and Vladimir Pavlovic. 2014. Sentiment Flow for Video Interestingness Prediction. In Proceedings of the 1st ACM International Workshop on Human Centered Event Understanding from Multimedia (HuEvent '14).","DOI":"10.1145\/2660505.2660513"},{"key":"key-10.1145\/3184558.3186584-39","doi-asserted-by":"crossref","unstructured":"Barret Zoph, Vijay Vasudevan, Jonathon Shlens, and Quoc V. Le. 2017. Learning transferable architectures for scalable image recognition. arXiv preprint arXiv:1707.07012 (2017).","DOI":"10.1109\/CVPR.2018.00907"}],"event":{"name":"Companion of the The Web Conference 2018","location":"Lyon, France","acronym":"WWW '18","number":"2018","sponsor":["IW3C2, International World Wide Web Conference Committee","SIGWEB, ACM Special Interest Group on Hypertext, Hypermedia, and Web"],"start":{"date-parts":[[2018,4,23]]},"end":{"date-parts":[[2018,4,27]]}},"container-title":["Companion of the The Web Conference 2018 on The Web Conference 2018 - WWW '18"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3184558.3186584","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/dl.acm.org\/ft_gateway.cfm?id=3186584&ftid=1958233&dwn=1","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,18]],"date-time":"2025-06-18T02:26:07Z","timestamp":1750213567000},"score":1,"resource":{"primary":{"URL":"http:\/\/dl.acm.org\/citation.cfm?doid=3184558.3186584"}},"subtitle":[],"proceedings-subject":"The Web Conference 2018","short-title":[],"issued":{"date-parts":[[2018]]},"references-count":39,"URL":"https:\/\/doi.org\/10.1145\/3184558.3186584","relation":{},"subject":[],"published":{"date-parts":[[2018]]}}}