{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,5]],"date-time":"2026-03-05T02:00:46Z","timestamp":1772676046157,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":39,"publisher":"ACM","license":[{"start":{"date-parts":[[2023,12,15]],"date-time":"2023-12-15T00:00:00Z","timestamp":1702598400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2023,12,15]]},"DOI":"10.1145\/3627631.3627648","type":"proceedings-article","created":{"date-parts":[[2024,1,31]],"date-time":"2024-01-31T12:08:32Z","timestamp":1706702912000},"page":"1-8","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":5,"title":["Dense captioning for Text-Image ReID"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-8873-4644","authenticated-orcid":false,"given":"A V","family":"Subramanyam","sequence":"first","affiliation":[{"name":"Department of Electronics &amp; Communications Engineering, Indraprastha Institute of Information Technology Delhi (IIIT-Delhi), IN"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-1599-228X","authenticated-orcid":false,"given":"Vibhu","family":"Dubey","sequence":"additional","affiliation":[{"name":"Department of Computer Science and Engineering, Indraprastha Institute of Information Technology Delhi (IIIT-Delhi), IN"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-3962-6261","authenticated-orcid":false,"given":"Niranjan","family":"Sundararajan","sequence":"additional","affiliation":[{"name":"Department of Computer Science and Engineering, Indraprastha Institute of Information Technology Delhi (IIIT-Delhi), IN"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2677-3071","authenticated-orcid":false,"given":"Brejesh","family":"Lall","sequence":"additional","affiliation":[{"name":"Department of Electrical Engineering, IIT Delhi, India"}]}],"member":"320","published-online":{"date-parts":[[2024,1,31]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"crossref","unstructured":"Surbhi Aggarwal Venkatesh\u00a0Babu Radhakrishnan and Anirban Chakraborty. 2020. Text-based person search via attribute-aided matching. In WACV. 2617\u20132625.","DOI":"10.1109\/WACV45572.2020.9093640"},{"key":"e_1_3_2_1_2_1","volume-title":"Image-text retrieval: A survey on recent research and development. IJCAI","author":"Cao Min","year":"2022","unstructured":"Min Cao, Shiping Li, Juntao Li, Liqiang Nie, and Min Zhang. 2022. Image-text retrieval: A survey on recent research and development. IJCAI (2022)."},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"crossref","unstructured":"Dapeng Chen Hongsheng Li Xihui Liu Yantao Shen Jing Shao Zejian Yuan and Xiaogang Wang. 2018. Improving deep visual representation for person re-identification by global and local image-language association. In ECCV. 54\u201370.","DOI":"10.1007\/978-3-030-01270-0_4"},{"key":"e_1_3_2_1_4_1","volume-title":"Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805","author":"Devlin Jacob","year":"2018","unstructured":"Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. 2018. Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805 (2018)."},{"key":"e_1_3_2_1_5_1","volume-title":"Semantically self-aligned network for text-to-image part-aware person re-identification. arXiv preprint arXiv:2107.12666","author":"Ding Zefeng","year":"2021","unstructured":"Zefeng Ding, Changxing Ding, Zhiyin Shao, and Dacheng Tao. 2021. Semantically self-aligned network for text-to-image part-aware person re-identification. arXiv preprint arXiv:2107.12666 (2021)."},{"key":"e_1_3_2_1_6_1","volume-title":"An image is worth 16x16 words: Transformers for image recognition at scale. arXiv preprint arXiv:2010.11929","author":"Dosovitskiy Alexey","year":"2020","unstructured":"Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, 2020. An image is worth 16x16 words: Transformers for image recognition at scale. arXiv preprint arXiv:2010.11929 (2020)."},{"key":"e_1_3_2_1_7_1","volume-title":"Improving visual-semantic embeddings with hard negatives. arXiv preprint arXiv:1707.05612","author":"Faghri Fartash","year":"2017","unstructured":"Fartash Faghri, David\u00a0J Fleet, Jamie\u00a0Ryan Kiros, and Sanja Fidler. 2017. Vse++: Improving visual-semantic embeddings with hard negatives. arXiv preprint arXiv:1707.05612 (2017)."},{"key":"e_1_3_2_1_8_1","volume-title":"Devise: A deep visual-semantic embedding model. NeurIPS 26","author":"Frome Andrea","year":"2013","unstructured":"Andrea Frome, Greg\u00a0S Corrado, Jon Shlens, Samy Bengio, Jeff Dean, Marc\u2019Aurelio Ranzato, and Tomas Mikolov. 2013. Devise: A deep visual-semantic embedding model. NeurIPS 26 (2013)."},{"key":"e_1_3_2_1_9_1","volume-title":"Contextual non-local alignment over full-scale representation for text-based person search. arXiv preprint arXiv:2101.03036","author":"Gao Chenyang","year":"2021","unstructured":"Chenyang Gao, Guanyu Cai, Xinyang Jiang, Feng Zheng, Jun Zhang, Yifei Gong, Pai Peng, Xiaowei Guo, and Xing Sun. 2021. Contextual non-local alignment over full-scale representation for text-based person search. arXiv preprint arXiv:2101.03036 (2021)."},{"key":"e_1_3_2_1_10_1","unstructured":"Douglas Gray Shane Brennan and Hai Tao. 2007. Evaluating appearance models for recognition reacquisition and tracking. In PETS Vol.\u00a03. 1\u20137."},{"key":"e_1_3_2_1_11_1","volume-title":"The curious case of neural text degeneration. ICLR","author":"Holtzman Ari","year":"2020","unstructured":"Ari Holtzman, Jan Buys, Li Du, Maxwell Forbes, and Yejin Choi. 2020. The curious case of neural text degeneration. ICLR (2020)."},{"key":"e_1_3_2_1_12_1","volume-title":"Lora: Low-rank adaptation of large language models. arXiv preprint arXiv:2106.09685","author":"Hu J","year":"2021","unstructured":"Edward\u00a0J Hu, Yelong Shen, Phillip Wallis, Zeyuan Allen-Zhu, Yuanzhi Li, Shean Wang, Lu Wang, and Weizhu Chen. 2021. Lora: Low-rank adaptation of large language models. arXiv preprint arXiv:2106.09685 (2021)."},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"crossref","unstructured":"Yan Huang Wei Wang and Liang Wang. 2017. Instance-aware image and sentence matching with selective multimodal lstm. In CVPR. 2310\u20132318.","DOI":"10.1109\/CVPR.2017.767"},{"key":"e_1_3_2_1_14_1","unstructured":"Chao Jia Yinfei Yang Ye Xia Yi-Ting Chen Zarana Parekh Hieu Pham Quoc Le Yun-Hsuan Sung Zhen Li and Tom Duerig. 2021. Scaling up visual and vision-language representation learning with noisy text supervision. In ICML. PMLR 4904\u20134916."},{"key":"e_1_3_2_1_15_1","volume-title":"Cross-Modal Implicit Relation Reasoning and Aligning for Text-to-Image Person Retrieval. CVPR","author":"Jiang Ding","year":"2023","unstructured":"Ding Jiang and Mang Ye. 2023. Cross-Modal Implicit Relation Reasoning and Aligning for Text-to-Image Person Retrieval. CVPR (2023)."},{"key":"e_1_3_2_1_16_1","volume-title":"Blip: Bootstrapping language-image pre-training for unified vision-language understanding and generation. In ICML. PMLR, 12888\u201312900.","author":"Li Junnan","year":"2022","unstructured":"Junnan Li, Dongxu Li, Caiming Xiong, and Steven Hoi. 2022. Blip: Bootstrapping language-image pre-training for unified vision-language understanding and generation. In ICML. PMLR, 12888\u201312900."},{"key":"e_1_3_2_1_17_1","volume-title":"Visualbert: A simple and performant baseline for vision and language. arXiv preprint arXiv:1908.03557","author":"Li Liunian\u00a0Harold","year":"2019","unstructured":"Liunian\u00a0Harold Li, Mark Yatskar, Da Yin, Cho-Jui Hsieh, and Kai-Wei Chang. 2019. Visualbert: A simple and performant baseline for vision and language. arXiv preprint arXiv:1908.03557 (2019)."},{"key":"e_1_3_2_1_18_1","unstructured":"Shuang Li Tong Xiao Hongsheng Li Wei Yang and Xiaogang Wang. 2017. Identity-aware textual-visual matching with latent co-attention. In ICCV. 1890\u20131899."},{"key":"e_1_3_2_1_19_1","unstructured":"Shuang Li Tong Xiao Hongsheng Li Bolei Zhou Dayu Yue and Xiaogang Wang. 2017. Person search with natural language description. In CVPR. 1970\u20131979."},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"crossref","unstructured":"Wei Li Rui Zhao and Xiaogang Wang. 2013. Human reidentification with transferred metric learning. In ACCV. 31\u201344.","DOI":"10.1007\/978-3-642-37331-2_3"},{"key":"e_1_3_2_1_21_1","volume-title":"Deepreid: Deep filter pairing neural network for person re-identification. In CVPR. 152\u2013159.","author":"Li Wei","year":"2014","unstructured":"Wei Li, Rui Zhao, Tong Xiao, and Xiaogang Wang. 2014. Deepreid: Deep filter pairing neural network for person re-identification. In CVPR. 152\u2013159."},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"crossref","unstructured":"Jiawei Liu Zheng-Jun Zha Richang Hong Meng Wang and Yongdong Zhang. 2019. Deep adversarial graph attention convolution network for text-based person search. In ACM MM. 665\u2013673.","DOI":"10.1145\/3343031.3350991"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"crossref","unstructured":"Yu Liu Yanming Guo Erwin\u00a0M Bakker and Michael\u00a0S Lew. 2017. Learning a recurrent residual fusion network for multimodal matching. In ICCV. 4107\u20134116.","DOI":"10.1109\/ICCV.2017.442"},{"key":"e_1_3_2_1_24_1","volume-title":"Vilbert: Pretraining task-agnostic visiolinguistic representations for vision-and-language tasks. NeurIPS 32","author":"Lu Jiasen","year":"2019","unstructured":"Jiasen Lu, Dhruv Batra, Devi Parikh, and Stefan Lee. 2019. Vilbert: Pretraining task-agnostic visiolinguistic representations for vision-and-language tasks. NeurIPS 32 (2019)."},{"key":"e_1_3_2_1_25_1","unstructured":"Alec Radford Jong\u00a0Wook Kim Chris Hallacy Aditya Ramesh Gabriel Goh Sandhini Agarwal Girish Sastry Amanda Askell Pamela Mishkin Jack Clark 2021. Learning transferable visual models from natural language supervision. In ICML. PMLR 8748\u20138763."},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"crossref","unstructured":"Robin Rombach Andreas Blattmann Dominik Lorenz Patrick Esser and Bj\u00f6rn Ommer. 2022. High-resolution image synthesis with latent diffusion models. In CVPR. 10684\u201310695.","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"e_1_3_2_1_27_1","volume-title":"U-net: Convolutional networks for biomedical image segmentation. In MICCAI. 234\u2013241.","author":"Ronneberger Olaf","year":"2015","unstructured":"Olaf Ronneberger, Philipp Fischer, and Thomas Brox. 2015. U-net: Convolutional networks for biomedical image segmentation. In MICCAI. 234\u2013241."},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"crossref","unstructured":"Zhiyin Shao Xinyu Zhang Meng Fang Zhifeng Lin Jian Wang and Changxing Ding. 2022. Learning Granularity-Unified Representations for Text-to-Image Person Re-identification. In ACM MM. 5566\u20135574.","DOI":"10.1145\/3503161.3548028"},{"key":"e_1_3_2_1_29_1","volume-title":"Improved deep metric learning with multi-class n-pair loss objective. NeurIPS 29","author":"Sohn Kihyuk","year":"2016","unstructured":"Kihyuk Sohn. 2016. Improved deep metric learning with multi-class n-pair loss objective. NeurIPS 29 (2016)."},{"key":"e_1_3_2_1_30_1","volume-title":"Lxmert: Learning cross-modality encoder representations from transformers. arXiv preprint arXiv:1908.07490","author":"Tan Hao","year":"2019","unstructured":"Hao Tan and Mohit Bansal. 2019. Lxmert: Learning cross-modality encoder representations from transformers. arXiv preprint arXiv:1908.07490 (2019)."},{"key":"e_1_3_2_1_31_1","unstructured":"Hugo Touvron Matthieu Cord Matthijs Douze Francisco Massa Alexandre Sablayrolles and Herv\u00e9 J\u00e9gou. 2021. Training data-efficient image transformers & distillation through attention. In ICML. PMLR 10347\u201310357."},{"key":"e_1_3_2_1_32_1","volume-title":"Attention is all you need. NeurIPS 30","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan\u00a0N Gomez, \u0141ukasz Kaiser, and Illia Polosukhin. 2017. Attention is all you need. NeurIPS 30 (2017)."},{"key":"e_1_3_2_1_33_1","unstructured":"Yushuang Wu Zizheng Yan Xiaoguang Han Guanbin Li Changqing Zou and Shuguang Cui. 2021. LapsCore: language-guided person search via color reasoning. In ICCV. 1624\u20131633."},{"key":"e_1_3_2_1_34_1","volume-title":"End-to-end deep learning for person search. arXiv preprint arXiv:1604.01850 2, 2","author":"Xiao Tong","year":"2016","unstructured":"Tong Xiao, Shuang Li, Bochao Wang, Liang Lin, and Xiaogang Wang. 2016. End-to-end deep learning for person search. arXiv preprint arXiv:1604.01850 2, 2 (2016), 4."},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"crossref","unstructured":"Ying Zhang and Huchuan Lu. 2018. Deep cross-modal projection learning for image-text matching. In ECCV. 686\u2013701.","DOI":"10.1007\/978-3-030-01246-5_42"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"crossref","unstructured":"Kecheng Zheng Wu Liu Jiawei Liu Zheng-Jun Zha and Tao Mei. 2020. Hierarchical gumbel attention network for text-based person search. In ACM MM. 3441\u20133449.","DOI":"10.1145\/3394171.3413864"},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"crossref","unstructured":"Liang Zheng Liyue Shen Lu Tian Shengjin Wang Jingdong Wang and Qi Tian. 2015. Scalable person re-identification: A benchmark. In ICCV. 1116\u20131124.","DOI":"10.1109\/ICCV.2015.133"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1145\/3383184"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"crossref","unstructured":"Aichun Zhu Zijie Wang Yifeng Li Xili Wan Jing Jin Tian Wang Fangqiang Hu and Gang Hua. 2021. DSSL: deep surroundings-person separation learning for text-based person retrieval. In ACM MM. 209\u2013217.","DOI":"10.1145\/3474085.3475369"}],"event":{"name":"ICVGIP '23: Indian Conference on Computer Vision, Graphics and Image Processing","location":"Rupnagar India","acronym":"ICVGIP '23"},"container-title":["Proceedings of the Fourteenth Indian Conference on Computer Vision, Graphics and Image Processing"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3627631.3627648","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3627631.3627648","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T19:49:41Z","timestamp":1755892181000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3627631.3627648"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,12,15]]},"references-count":39,"alternative-id":["10.1145\/3627631.3627648","10.1145\/3627631"],"URL":"https:\/\/doi.org\/10.1145\/3627631.3627648","relation":{},"subject":[],"published":{"date-parts":[[2023,12,15]]},"assertion":[{"value":"2024-01-31","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}