{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,6,18]],"date-time":"2025-06-18T04:31:29Z","timestamp":1750221089979,"version":"3.41.0"},"publisher-location":"New York, New York, USA","reference-count":27,"publisher":"ACM Press","license":[{"start":{"date-parts":[[2018,1,1]],"date-time":"2018-01-01T00:00:00Z","timestamp":1514764800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2018]]},"DOI":"10.1145\/3287921.3287955","type":"proceedings-article","created":{"date-parts":[[2018,12,13]],"date-time":"2018-12-13T15:45:16Z","timestamp":1544715916000},"page":"367-374","source":"Crossref","is-referenced-by-count":4,"title":["Personal Diary Generation from Wearable Cameras with Concept Augmented Image Captioning and Wide Trail Strategy"],"prefix":"10.1145","author":[{"given":"Viet-Khoa","family":"Vo-Ho","sequence":"first","affiliation":[{"name":"Software Engineering Lab, University of Science, VNU-HCM"}]},{"given":"Quoc-An","family":"Luong","sequence":"additional","affiliation":[{"name":"Software Engineering Lab, University of Science, VNU-HCM"}]},{"given":"Duy-Tam","family":"Nguyen","sequence":"additional","affiliation":[{"name":"Software Engineering Lab, University of Science, VNU-HCM"}]},{"given":"Mai-Khiem","family":"Tran","sequence":"additional","affiliation":[{"name":"Software Engineering Lab, University of Science, VNU-HCM"}]},{"given":"Minh-Triet","family":"Tran","sequence":"additional","affiliation":[{"name":"Software Engineering Lab, University of Science, VNU-HCM"}]}],"member":"320","reference":[{"key":"key-10.1145\/3287921.3287955-1","unstructured":"Levent Bayindir. 2017. A survey of people-centric sensing studies utilizing mobile phone sensors. 
9 (06 2017), 421--448."},{"key":"key-10.1145\/3287921.3287955-2","unstructured":"Duc-Tien Dang-Nguyen, Luca Piras, Michael Riegler, Liting Zhou, Mathias Lux, and Cathal Gurrin. 2018. Overview of ImageCLEFlifelog 2018: Daily Living Understanding and Lifelog Moment Retrieval. In CLEF2018 Working Notes (CEUR Workshop Proceedings). CEUR-WS.org &lt;http:\/\/ceur-ws.org&gt;, Avignon, France."},{"key":"key-10.1145\/3287921.3287955-3","unstructured":"Jacob Devlin, Saurabh Gupta, Ross B. Girshick, Margaret Mitchell, and C. Lawrence Zitnick. 2015. Exploring Nearest Neighbor Approaches for Image Captioning. CoRR abs\/1505.04467 (2015). arXiv:1505.04467 http:\/\/arxiv.org\/abs\/1505.04467"},{"key":"key-10.1145\/3287921.3287955-4","doi-asserted-by":"crossref","unstructured":"Ali Farhadi, Mohsen Hejrati, Mohammad Amin Sadeghi, Peter Young, Cyrus Rashtchian, Julia Hockenmaier, and David Forsyth. 2010. Every Picture Tells a Story: Generating Sentences from Images. In Proceedings of the 11th European Conference on Computer Vision: Part IV (ECCV'10). Springer-Verlag, Berlin, Heidelberg, 15--29. http:\/\/dl.acm.org\/citation.cfm?id=1888089.1888092","DOI":"10.1007\/978-3-642-15561-1_2"},{"key":"key-10.1145\/3287921.3287955-5","doi-asserted-by":"crossref","unstructured":"Cathal Gurrin, Alan F. Smeaton, and Aiden R. Doherty. 2014. LifeLogging: Personal Big Data. Found. Trends Inf. Retr. 8, 1 (June 2014), 1--125.","DOI":"10.1561\/1500000033"},{"key":"key-10.1145\/3287921.3287955-6","doi-asserted-by":"crossref","unstructured":"Kaiming He, Georgia Gkioxari, Piotr Dollar, and Ross Girshick. 2017. Mask R-CNN. In The IEEE International Conference on Computer Vision (ICCV).","DOI":"10.1109\/ICCV.2017.322"},{"key":"key-10.1145\/3287921.3287955-7","doi-asserted-by":"crossref","unstructured":"Kaiming He, Xiangyu Zhang, Shaoqing Ren, and Jian Sun. 2016. Deep Residual Learning for Image Recognition. 
In The IEEE Conference on Computer Vision and Pattern Recognition (CVPR).","DOI":"10.1109\/CVPR.2016.90"},{"key":"key-10.1145\/3287921.3287955-8","doi-asserted-by":"crossref","unstructured":"Sepp Hochreiter and Jürgen Schmidhuber. 1997. Long Short-Term Memory. Neural Comput. 9, 8 (Nov. 1997), 1735--1780.","DOI":"10.1162\/neco.1997.9.8.1735"},{"key":"key-10.1145\/3287921.3287955-9","unstructured":"Po-Sen Huang, Chong Wang, Dengyong Zhou, and Li Deng. 2017. Neural Phrase-based Machine Translation. CoRR abs\/1706.05565 (2017). arXiv:1706.05565 http:\/\/arxiv.org\/abs\/1706.05565"},{"key":"key-10.1145\/3287921.3287955-10","doi-asserted-by":"crossref","unstructured":"Andrej Karpathy and Li Fei-Fei. 2015. Deep Visual-Semantic Alignments for Generating Image Descriptions. In The IEEE Conference on Computer Vision and Pattern Recognition (CVPR).","DOI":"10.1109\/CVPR.2015.7298932"},{"key":"key-10.1145\/3287921.3287955-11","doi-asserted-by":"crossref","unstructured":"G. Kulkarni, V. Premraj, S. Dhar, Siming Li, Yejin Choi, A. C. Berg, and T. L. Berg. 2011. Baby Talk: Understanding and Generating Simple Image Descriptions. In Proceedings of the 2011 IEEE Conference on Computer Vision and Pattern Recognition (CVPR '11). IEEE Computer Society, Washington, DC, USA, 1601--1608. https:\/\/doi.org\/10.1109\/CVPR.2011.5995466","DOI":"10.1109\/CVPR.2011.5995466"},{"key":"key-10.1145\/3287921.3287955-12","unstructured":"Tsung-Yi Lin, Michael Maire, Serge J. Belongie, Lubomir D. Bourdev, Ross B. Girshick, James Hays, Pietro Perona, Deva Ramanan, Piotr Dollár, and C. Lawrence Zitnick. 2014. Microsoft COCO: Common Objects in Context. CoRR abs\/1405.0312 (2014). arXiv:1405.0312 http:\/\/arxiv.org\/abs\/1405.0312"},{"key":"key-10.1145\/3287921.3287955-13","unstructured":"Minh-Thang Luong, Hieu Pham, and Christopher D. Manning. 2015. Effective Approaches to Attention-based Neural Machine Translation. CoRR abs\/1508.04025 (2015). 
arXiv:1508.04025 http:\/\/arxiv.org\/abs\/1508.04025"},{"key":"key-10.1145\/3287921.3287955-14","unstructured":"Tomas Mikolov, Kai Chen, Gregory S. Corrado, and Jeffrey Dean. 2013. Efficient Estimation of Word Representations in Vector Space. CoRR abs\/1301.3781 (2013)."},{"key":"key-10.1145\/3287921.3287955-15","unstructured":"Tran Minh-Triet, Truong Thanh-Dat, Dinh-Duy Tung, Vo-Ho Viet-Khoa, Luong Quoc-An, and Nguyen Vinh-Tiep. 2018. Lifelog Moment Retrieval with Visual Concept Fusion and Text-based Query Expansion. In CLEF2018 Working Notes (CEUR Workshop Proceedings). CEUR-WS.org &lt;http:\/\/ceur-ws.org&gt;, Avignon, France."},{"key":"key-10.1145\/3287921.3287955-16","unstructured":"Jonghwan Mun, Minsu Cho, and Bohyung Han. 2016. Text-guided Attention Model for Image Captioning. CoRR abs\/1612.03557 (2016). arXiv:1612.03557 http:\/\/arxiv.org\/abs\/1612.03557"},{"key":"key-10.1145\/3287921.3287955-17","doi-asserted-by":"crossref","unstructured":"Vinh-Tiep Nguyen, Khanh-Duy Le, Minh-Triet Tran, and Morten Fjeld. 2016. NowAndThen: A Social Network-based Photo Recommendation Tool Supporting Reminiscence. In Proceedings of the 15th International Conference on Mobile and Ubiquitous Multimedia (MUM '16). ACM, New York, NY, USA, 159--168.","DOI":"10.1145\/3012709.3012738"},{"key":"key-10.1145\/3287921.3287955-18","doi-asserted-by":"crossref","unstructured":"Reza Rawassizadeh, Martin Tomitsch, Katarzyna Wac, and A. Min Tjoa. 2013. UbiqLog: a generic mobile phone-based lifelog framework. Personal and Ubiquitous Computing 17, 4 (01 Apr 2013), 621--637.","DOI":"10.1007\/s00779-012-0511-8"},{"key":"key-10.1145\/3287921.3287955-19","unstructured":"Steven J. Rennie, Etienne Marcheret, Youssef Mroueh, Jarret Ross, and Vaibhava Goel. 2016. Self-critical Sequence Training for Image Captioning. CoRR abs\/1612.00563 (2016). arXiv:1612.00563 http:\/\/arxiv.org\/abs\/1612.00563"},{"key":"key-10.1145\/3287921.3287955-20","unstructured":"Karen Simonyan and Andrew Zisserman. 2014. 
Very Deep Convolutional Networks for Large-Scale Image Recognition. CoRR abs\/1409.1556 (2014). arXiv:1409.1556 http:\/\/arxiv.org\/abs\/1409.1556"},{"key":"key-10.1145\/3287921.3287955-21","doi-asserted-by":"crossref","unstructured":"Thanh-Dat Truong, Tung Dinh-Duy, Vinh-Tiep Nguyen, and Minh-Triet Tran. 2018. Lifelogging Retrieval based on Semantic Concepts Fusion. In Lifelogging Search Challenge Workshop within the ACM International Conference on Multimedia Retrieval.","DOI":"10.1145\/3210539.3210545"},{"key":"key-10.1145\/3287921.3287955-22","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz Kaiser, and Illia Polosukhin. 2017. Attention Is All You Need. CoRR abs\/1706.03762 (2017). arXiv:1706.03762 http:\/\/arxiv.org\/abs\/1706.03762"},{"key":"key-10.1145\/3287921.3287955-23","doi-asserted-by":"crossref","unstructured":"Oriol Vinyals, Alexander Toshev, Samy Bengio, and Dumitru Erhan. 2015. Show and Tell: A Neural Image Caption Generator. In The IEEE Conference on Computer Vision and Pattern Recognition (CVPR).","DOI":"10.1109\/CVPR.2015.7298935"},{"key":"key-10.1145\/3287921.3287955-24","unstructured":"Kelvin Xu, Jimmy Ba, Ryan Kiros, Kyunghyun Cho, Aaron Courville, Ruslan Salakhudinov, Rich Zemel, and Yoshua Bengio. 2015. Show, Attend and Tell: Neural Image Caption Generation with Visual Attention. In Proceedings of the 32nd International Conference on Machine Learning (Proceedings of Machine Learning Research), Francis Bach and David Blei (Eds.), Vol. 37. PMLR, Lille, France, 2048--2057."},{"key":"key-10.1145\/3287921.3287955-25","doi-asserted-by":"crossref","unstructured":"Mark Yatskar, Michel Galley, Lucy Vanderwende, and Luke Zettlemoyer. 2014. See No Evil, Say No Evil: Description Generation from Densely Labeled Images. 110--120. 
https:\/\/www.microsoft.com\/en-us\/research\/publication\/see-no-evil-say-no-evil-description-generation-from-densely-labeled-images\/","DOI":"10.3115\/v1\/S14-1015"},{"key":"key-10.1145\/3287921.3287955-26","doi-asserted-by":"crossref","unstructured":"Quanzeng You, Hailin Jin, Zhaowen Wang, Chen Fang, and Jiebo Luo. 2016. Image Captioning With Semantic Attention. In The IEEE Conference on Computer Vision and Pattern Recognition (CVPR).","DOI":"10.1109\/CVPR.2016.503"},{"key":"key-10.1145\/3287921.3287955-27","doi-asserted-by":"crossref","unstructured":"Bolei Zhou, Agata Lapedriza, Aditya Khosla, Aude Oliva, and Antonio Torralba. 2017. Places: A 10 million Image Database for Scene Recognition. IEEE Transactions on Pattern Analysis and Machine Intelligence (2017).","DOI":"10.1109\/TPAMI.2017.2723009"}],"event":{"number":"9","sponsor":["SOICT, School of Information and Communication Technology - HUST","NAFOSTED, The National Foundation for Science and Technology Development"],"acronym":"SoICT 2018","name":"the Ninth International Symposium","start":{"date-parts":[[2018,12,6]]},"location":"Danang City, Viet Nam","end":{"date-parts":[[2018,12,7]]}},"container-title":["Proceedings of the Ninth International Symposium on Information and Communication Technology - SoICT 2018"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3287921.3287955","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/dl.acm.org\/ft_gateway.cfm?id=3287955&ftid=2025990&dwn=1","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,18]],"date-time":"2025-06-18T00:57:54Z","timestamp":1750208274000},"score":1,"resource":{"primary":{"URL":"http:\/\/dl.acm.org\/citation.cfm?doid=3287921.3287955"}},"subtitle":[],"proceedings-subject":"Information and Communication 
Technology","short-title":[],"issued":{"date-parts":[[2018]]},"references-count":27,"URL":"https:\/\/doi.org\/10.1145\/3287921.3287955","relation":{},"subject":[],"published":{"date-parts":[[2018]]}}}