{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,11,15]],"date-time":"2025-11-15T10:21:39Z","timestamp":1763202099996,"version":"3.41.0"},"publisher-location":"New York, NY, USA","reference-count":47,"publisher":"ACM","license":[{"start":{"date-parts":[[2017,10,23]],"date-time":"2017-10-23T00:00:00Z","timestamp":1508716800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"National Key Research and Development Plan","award":["2016YFB1001202"],"award-info":[{"award-number":["2016YFB1001202"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2017,10,23]]},"DOI":"10.1145\/3123266.3123420","type":"proceedings-article","created":{"date-parts":[[2017,10,20]],"date-time":"2017-10-20T13:04:26Z","timestamp":1508504666000},"page":"1838-1846","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":54,"title":["Video Captioning with Guidance of Multimodal Latent Topics"],"prefix":"10.1145","author":[{"given":"Shizhe","family":"Chen","sequence":"first","affiliation":[{"name":"Renmin University of China, Beijing, China"}]},{"given":"Jia","family":"Chen","sequence":"additional","affiliation":[{"name":"Carnegie Mellon University, Pittsburgh, PA, USA"}]},{"given":"Qin","family":"Jin","sequence":"additional","affiliation":[{"name":"Renmin University of China, Beijing, China"}]},{"given":"Alexander","family":"Hauptmann","sequence":"additional","affiliation":[{"name":"Carnegie Mellon University, Pittsburgh, PA, USA"}]}],"member":"320","published-online":{"date-parts":[[2017,10,23]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"Jimmy Ba and Rich Caruana. 2014. Do deep nets really need to be deep?. In NIPS. 2654--2662. Jimmy Ba and Rich Caruana. 2014. Do deep nets really need to be deep?. In NIPS. 2654--2662."},{"volume-title":"Neural machine translation by jointly learning to align and translate. arXiv preprint arXiv:1409.0473","year":"2014","author":"Bahdanau Dzmitry","key":"e_1_3_2_1_2_1"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.5555\/944919.944937"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1145\/3078971.3079000"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/TASSP.1980.1163420"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1145\/1014052.1014118"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1145\/2185520.2185597"},{"volume-title":"Improving Interpretability of Deep Neural Networks with Semantic Information. arXiv preprint arXiv:1703.04096","year":"2017","author":"Dong Yinpeng","key":"e_1_3_2_1_8_1"},{"volume-title":"Semantic Compositional Networks for Visual Captioning. arXiv preprint arXiv:1611.08002","year":"2016","author":"Gan Zhe","key":"e_1_3_2_1_9_1"},{"key":"e_1_3_2_1_10_1","volume-title":"Aistats","volume":"15","author":"Glorot Xavier","year":"2011"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2013.337"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1162\/neco.1997.9.8.1735"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1145\/2964284.2984065"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1145\/2911996.2912043"},{"volume-title":"Adam: A method for stochastic optimization. arXiv preprint arXiv:1412.6980","year":"2014","author":"Kingma Diederik","key":"e_1_3_2_1_15_1"},{"key":"e_1_3_2_1_16_1","unstructured":"Alex Krizhevsky Geoffrey E Hinton and others. 2010. Factored 3-way restricted boltzmann machines for modeling natural images International conference on artificial intelligence and statistics. 621--628. Alex Krizhevsky Geoffrey E Hinton and others. 2010. Factored 3-way restricted boltzmann machines for modeling natural images International conference on artificial intelligence and statistics. 621--628."},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2011.5995466"},{"volume-title":"Meteor universal: Language specific translation evaluation for any target language. ACL","year":"2014","author":"Alon Lavie Michael Denkowski","key":"e_1_3_2_1_18_1"},{"key":"e_1_3_2_1_19_1","unstructured":"R\u00e9mi Lebret Pedro H. O. Pinheiro and Ronan Collobert. 2015. Phrase-based Image Captioning. In ICML. 2085--2094. R\u00e9mi Lebret Pedro H. O. Pinheiro and Ronan Collobert. 2015. Phrase-based Image Captioning. In ICML. 2085--2094."},{"volume-title":"Michael Cogswell, Viresh Ranjan, David Crandall, and Dhruv Batra.","year":"2016","author":"Lee Stefan","key":"e_1_3_2_1_20_1"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1145\/2964284.2964287"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"crossref","unstructured":"Yuncheng Li Yale Song Liangliang Cao Joel Tetreault Larry Goldberg Alejandro Jaimes and Jiebo Luo. 2016 b. Tgif: A new dataset and benchmark on animated gif description CVPR. 4641--4650. Yuncheng Li Yale Song Liangliang Cao Joel Tetreault Larry Goldberg Alejandro Jaimes and Jiebo Luo. 2016 b. Tgif: A new dataset and benchmark on animated gif description CVPR. 4641--4650.","DOI":"10.1109\/CVPR.2016.502"},{"volume-title":"Rouge: A package for automatic evaluation of summaries Text summarization branches out: Proceedings of the ACL-04 workshop","year":"2004","author":"Lin Chin-Yew","key":"e_1_3_2_1_23_1"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"crossref","unstructured":"R Memisevic and G Hinton. 2007. Unsupervised Learning of Image Transformations. In CVPR. 1--8. R Memisevic and G Hinton. 2007. Unsupervised Learning of Image Transformations. In CVPR. 1--8.","DOI":"10.1109\/CVPR.2007.383036"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"crossref","unstructured":"Pingbo Pan Zhongwen Xu Yi Yang Fei Wu and Yueting Zhuang. 2016 b. Hierarchical recurrent neural encoder for video representation with application to captioning CVPR. 1029--1038. Pingbo Pan Zhongwen Xu Yi Yang Fei Wu and Yueting Zhuang. 2016 b. Hierarchical recurrent neural encoder for video representation with application to captioning CVPR. 1029--1038.","DOI":"10.1109\/CVPR.2016.117"},{"key":"e_1_3_2_1_26_1","unstructured":"Yingwei Pan Tao Mei Ting Yao Houqiang Li and Yong Rui. 2016 a. Jointly Modeling Embedding and Translation to Bridge Video and Language CVPR. 4594--4602. Yingwei Pan Tao Mei Ting Yao Houqiang Li and Yong Rui. 2016 a. Jointly Modeling Embedding and Translation to Bridge Video and Language CVPR. 4594--4602."},{"volume-title":"Softening quantization in bag-of-audio-words","author":"Pancoast Stephanie","key":"e_1_3_2_1_27_1","doi-asserted-by":"crossref","DOI":"10.1109\/ICASSP.2014.6853821"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.3115\/1073083.1073135"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1145\/2964284.2984066"},{"volume-title":"Self-critical Sequence Training for Image Captioning. arXiv preprint arXiv:1612.00563","year":"2016","author":"Rennie Steven J","key":"e_1_3_2_1_30_1"},{"key":"e_1_3_2_1_31_1","unstructured":"Microsoft Research. 2016. ACM Multimedia MSR video to language challenge. http:\/\/www.acmmm.org\/2016\/wp-content\/uploads\/2016\/04\/ACMMM16_GC_MSR_Video_to_Language_Updated.pdf (2016). Microsoft Research. 2016. ACM Multimedia MSR video to language challenge. http:\/\/www.acmmm.org\/2016\/wp-content\/uploads\/2016\/04\/ACMMM16_GC_MSR_Video_to_Language_Updated.pdf (2016)."},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-013-0636-x"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"crossref","unstructured":"Zhiqiang Shen Jianguo Li Zhou Su Minjun Li Yurong Chen Yu-Gang Jiang and Xiangyang Xue. 2017. Weakly Supervised Dense Video Captioning. In CVPR. Zhiqiang Shen Jianguo Li Zhou Su Minjun Li Yurong Chen Yu-Gang Jiang and Xiangyang Xue. 2017. Weakly Supervised Dense Video Captioning. In CVPR.","DOI":"10.1109\/CVPR.2017.548"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1145\/2964284.2984062"},{"key":"e_1_3_2_1_35_1","unstructured":"Ilya Sutskever Oriol Vinyals and Quoc V Le. 2014. Sequence to sequence learning with neural networks NIPS. 3104--3112. Ilya Sutskever Oriol Vinyals and Quoc V Le. 2014. Sequence to sequence learning with neural networks NIPS. 3104--3112."},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"crossref","unstructured":"Christian Szegedy Sergey Ioffe Vincent Vanhoucke and Alex Alemi. 2016. Inception-v4 Inception-ResNet and the Impact of Residual Connections on Learning. (2016). Christian Szegedy Sergey Ioffe Vincent Vanhoucke and Alex Alemi. 2016. Inception-v4 Inception-ResNet and the Impact of Residual Connections on Learning. (2016).","DOI":"10.1609\/aaai.v31i1.11231"},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.510"},{"volume-title":"Cider: Consensus-based image description evaluation CVPR. 4566--4575.","year":"2015","author":"Vedantam Ramakrishna","key":"e_1_3_2_1_38_1"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.515"},{"volume-title":"Translating Videos to Natural Language Using Deep Recurrent Neural Networks. Computer Science","year":"2014","author":"Venugopalan Subhashini","key":"e_1_3_2_1_40_1"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"crossref","unstructured":"Oriol Vinyals Alexander Toshev Samy Bengio and Dumitru Erhan. 2015. Show and tell: A neural image caption generator. CVPR. 3156--3164. Oriol Vinyals Alexander Toshev Samy Bengio and Dumitru Erhan. 2015. Show and tell: A neural image caption generator. CVPR. 3156--3164.","DOI":"10.1109\/CVPR.2015.7298935"},{"volume-title":"Msr-vtt: A large video description dataset for bridging video and language CVPR. 5288--5296.","year":"2016","author":"Xu Jun","key":"e_1_3_2_1_42_1"},{"volume-title":"attend and tell: Neural image caption generation with visual attention. arXiv:1502.03044","year":"2015","author":"Xu Kelvin","key":"e_1_3_2_1_43_1"},{"key":"e_1_3_2_1_44_1","unstructured":"Zhilin Yang Ye Yuan Yuexin Wu William W Cohen and Ruslan R Salakhutdinov. 2016. Review networks for caption generation. In NIPS. 2361--2369. Zhilin Yang Ye Yuan Yuexin Wu William W Cohen and Ruslan R Salakhutdinov. 2016. Review networks for caption generation. In NIPS. 2361--2369."},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.512"},{"volume-title":"Image captioning with semantic attention. arXiv:1603.03925","year":"2016","author":"You Quanzeng","key":"e_1_3_2_1_46_1"},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"crossref","unstructured":"Haonan Yu Jiang Wang Zhiheng Huang Yi Yang and Wei Xu. 2016. Video Paragraph Captioning Using Hierarchical Recurrent Neural Networks CVPR. 4584--4593. Haonan Yu Jiang Wang Zhiheng Huang Yi Yang and Wei Xu. 2016. Video Paragraph Captioning Using Hierarchical Recurrent Neural Networks CVPR. 4584--4593.","DOI":"10.1109\/CVPR.2016.496"}],"event":{"name":"MM '17: ACM Multimedia Conference","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Mountain View California USA","acronym":"MM '17"},"container-title":["Proceedings of the 25th ACM international conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3123266.3123420","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3123266.3123420","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,26]],"date-time":"2025-06-26T16:45:59Z","timestamp":1750956359000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3123266.3123420"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2017,10,23]]},"references-count":47,"alternative-id":["10.1145\/3123266.3123420","10.1145\/3123266"],"URL":"https:\/\/doi.org\/10.1145\/3123266.3123420","relation":{},"subject":[],"published":{"date-parts":[[2017,10,23]]},"assertion":[{"value":"2017-10-23","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}