{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,11,13]],"date-time":"2025-11-13T07:25:09Z","timestamp":1763018709483,"version":"build-2065373602"},"publisher-location":"Cham","reference-count":60,"publisher":"Springer Nature Switzerland","isbn-type":[{"type":"print","value":"9783031729195"},{"type":"electronic","value":"9783031729201"}],"license":[{"start":{"date-parts":[[2024,10,1]],"date-time":"2024-10-01T00:00:00Z","timestamp":1727740800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,10,1]],"date-time":"2024-10-01T00:00:00Z","timestamp":1727740800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-72920-1_2","type":"book-chapter","created":{"date-parts":[[2024,9,30]],"date-time":"2024-09-30T08:02:57Z","timestamp":1727683377000},"page":"21-38","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":6,"title":["ControlCap: Controllable Region-Level Captioning"],"prefix":"10.1007","author":[{"given":"Yuzhong","family":"Zhao","sequence":"first","affiliation":[]},{"given":"Yue","family":"Liu","sequence":"additional","affiliation":[]},{"given":"Zonghao","family":"Guo","sequence":"additional","affiliation":[]},{"given":"Weijia","family":"Wu","sequence":"additional","affiliation":[]},{"given":"Chen","family":"Gong","sequence":"additional","affiliation":[]},{"given":"Qixiang","family":"Ye","sequence":"additional","affiliation":[]},{"given":"Fang","family":"Wan","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,10,1]]},"reference":[{"key":"2_CR1","unstructured":"Introducing ChatGPT (2022). https:\/\/openai.com\/blog\/chatgpt"},{"key":"2_CR2","unstructured":"Alayrac, J., et al.: Flamingo: a visual language model for few-shot learning. In: NeurIPS (2022)"},{"key":"2_CR3","unstructured":"Brown, T.B., et al.: Language models are few-shot learners. In: NeurIPS (2020)"},{"key":"2_CR4","doi-asserted-by":"crossref","unstructured":"Carlsson, F., \u00d6hman, J., Liu, F., Verlinden, S., Nivre, J., Sahlgren, M.: Fine-grained controllable text generation using non-residual prompting. In: ACL, pp. 6837\u20136857 (2022)","DOI":"10.18653\/v1\/2022.acl-long.471"},{"key":"2_CR5","unstructured":"Chen, J., et al.: MiniGPT-V2: large language model as a unified interface for vision-language multi-task learning. arXiv preprint arXiv:2310.09478 (2023)"},{"key":"2_CR6","unstructured":"Chen, K., Zhang, Z., Zeng, W., Zhang, R., Zhu, F., Zhao, R.: Shikra: unleashing multimodal llm\u2019s referential dialogue magic. arXiv preprint arXiv:2306.15195 (2023)"},{"key":"2_CR7","unstructured":"Chung, H.W., et al.: Scaling instruction-finetuned language models. arXiv preprint arXiv:2210.11416 (2022)"},{"key":"2_CR8","unstructured":"Dai, W., et al.: InstructBLIP: towards general-purpose vision-language models with instruction tuning. arXiv preprint arXiv:2305.06500 (2023)"},{"key":"2_CR9","unstructured":"Dathathri, S., et al.: Plug and play language models: a simple approach to controlled text generation. 
In: ICLR (2020)"},{"key":"2_CR10","doi-asserted-by":"crossref","unstructured":"Deng, J., Dong, W., Socher, R., Li, L.J., Li, K., Fei-Fei, L.: ImageNet: a large-scale hierarchical image database. In: IEEE CVPR, pp. 248\u2013255 (2009)","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"2_CR11","unstructured":"Devlin, J., Chang, M., Lee, K., Toutanova, K.: BERT: pre-training of deep bidirectional transformers for language understanding. In: Burstein, J., Doran, C., Solorio, T. (eds.) NAACL, pp. 4171\u20134186 (2019)"},{"key":"2_CR12","doi-asserted-by":"crossref","unstructured":"Ding, N., Deng, C., Tan, M., Du, Q., Ge, Z., Wu, Q.: Image captioning with controllable and adaptive length levels. IEEE TPAMI 764\u2013779 (2024)","DOI":"10.1109\/TPAMI.2023.3328298"},{"key":"2_CR13","unstructured":"Dosovitskiy, A., et al.: An image is worth 16$$\\times $$16 words: transformers for image recognition at scale. In: ICLR (2021)"},{"key":"2_CR14","doi-asserted-by":"crossref","unstructured":"Fan, A., Lewis, M., Dauphin, Y.N.: Hierarchical neural story generation. In: Gurevych, I., Miyao, Y. (eds.) ACL, pp. 889\u2013898 (2018)","DOI":"10.18653\/v1\/P18-1082"},{"key":"2_CR15","doi-asserted-by":"crossref","unstructured":"Fang, Y., et al.: EVA: exploring the limits of masked visual representation learning at scale. In: IEEE CVPR, pp. 19358\u201319369 (2023)","DOI":"10.1109\/CVPR52729.2023.01855"},{"key":"2_CR16","doi-asserted-by":"crossref","unstructured":"Guo, Q., et al.: RegionGPT: towards region understanding vision language model (2024)","DOI":"10.1109\/CVPR52733.2024.01309"},{"key":"2_CR17","doi-asserted-by":"crossref","unstructured":"He, K., Gkioxari, G., Doll\u00e1r, P., Girshick, R.: Mask R-CNN. In: IEEE ICCV, pp. 2961\u20132969 (2017)","DOI":"10.1109\/ICCV.2017.322"},{"key":"2_CR18","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., Sun, J.: Deep residual learning for image recognition. In: IEEE CVPR, pp. 770\u2013778 (2016)","DOI":"10.1109\/CVPR.2016.90"},{"key":"2_CR19","doi-asserted-by":"crossref","unstructured":"Hochreiter, S., Schmidhuber, J.: Long short-term memory. Neural Comput. 1735\u20131780 (1997)","DOI":"10.1162\/neco.1997.9.8.1735"},{"key":"2_CR20","doi-asserted-by":"crossref","unstructured":"Hu, Y., Hua, H., Yang, Z., Shi, W., Smith, N.A., Luo, J.: PromptCap: prompt-guided image captioning for VQA with GPT-3. In: IEEE ICCV, pp. 2963\u20132975 (2023)","DOI":"10.1109\/ICCV51070.2023.00277"},{"key":"2_CR21","unstructured":"Huang, X., et al.: Segment and caption anything (2024). https:\/\/arxiv.org\/abs\/2312.00869"},{"key":"2_CR22","unstructured":"Huang, X., et al.: Tag2Text: guiding vision-language model via image tagging. arXiv preprint arXiv:2303.05657 (2023)"},{"key":"2_CR23","doi-asserted-by":"crossref","unstructured":"Johnson, J., Karpathy, A., Fei-Fei, L.: DenseCap: fully convolutional localization networks for dense captioning. In: IEEE CVPR, pp. 4565\u20134574 (2016)","DOI":"10.1109\/CVPR.2016.494"},{"key":"2_CR24","doi-asserted-by":"crossref","unstructured":"Karatzas, D., et\u00a0al.: ICDAR 2015 competition on robust reading. In: IEEE ICDAR, pp. 1156\u20131160 (2015)","DOI":"10.1109\/ICDAR.2015.7333942"},{"key":"2_CR25","doi-asserted-by":"crossref","unstructured":"Kirillov, A., et al.: Segment anything. In: IEEE ICCV, pp. 4015\u20134026 (2023)","DOI":"10.1109\/ICCV51070.2023.00371"},{"key":"2_CR26","doi-asserted-by":"crossref","unstructured":"Krishna, R., et\u00a0al.: Visual genome: Connecting language and vision using crowdsourced dense image annotations. 
IJCV 32\u201373 (2017)","DOI":"10.1007\/s11263-016-0981-7"},{"key":"2_CR27","unstructured":"Li, J., Li, D., Savarese, S., Hoi, S.C.H.: BLIP-2: bootstrapping language-image pre-training with frozen image encoders and large language models. In: ICML, pp. 19730\u201319742 (2023)"},{"key":"2_CR28","unstructured":"Li, J., Li, D., Xiong, C., Hoi, S.C.H.: BLIP: bootstrapping language-image pre-training for unified vision-language understanding and generation. In: ICML, pp. 12888\u201312900 (2022)"},{"key":"2_CR29","doi-asserted-by":"crossref","unstructured":"Li, P., Zhang, H., Liu, X., Shi, S.: Rigid formats controlled text generation. In: ACL (2020)","DOI":"10.18653\/v1\/2020.acl-main.68"},{"key":"2_CR30","unstructured":"Li, X., Thickstun, J., Gulrajani, I., Liang, P., Hashimoto, T.B.: Diffusion-LM improves controllable text generation. In: NeurIPS (2022)"},{"key":"2_CR31","doi-asserted-by":"crossref","unstructured":"Li, X., Jiang, S., Han, J.: Learning object context for dense captioning. In: AAAI, pp. 8650\u20138657 (2019)","DOI":"10.1609\/aaai.v33i01.33018650"},{"key":"2_CR32","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"740","DOI":"10.1007\/978-3-319-10602-1_48","volume-title":"Computer Vision \u2013 ECCV 2014","author":"T-Y Lin","year":"2014","unstructured":"Lin, T.-Y., et al.: Microsoft COCO: common objects in context. In: Fleet, D., Pajdla, T., Schiele, B., Tuytelaars, T. (eds.) ECCV 2014. LNCS, vol. 8693, pp. 740\u2013755. Springer, Cham (2014). https:\/\/doi.org\/10.1007\/978-3-319-10602-1_48"},{"key":"2_CR33","unstructured":"Liu, H., Li, C., Wu, Q., Lee, Y.J.: Visual instruction tuning. arXiv preprint arXiv:2304.08485 (2023)"},{"key":"2_CR34","doi-asserted-by":"crossref","unstructured":"Liu, R., Jia, C., Wei, J., Xu, G., Wang, L., Vosoughi, S.: Mitigating political bias in language models through reinforced calibration. In: AAAI, pp. 14857\u201314866 (2021)","DOI":"10.1609\/aaai.v35i17.17744"},{"key":"2_CR35","unstructured":"Liu, S., Zhang, L., Yang, X., Su, H., Zhu, J.: Query2Label: a simple transformer way to multi-label classification. arXiv preprint arXiv:2107.10834 (2021)"},{"key":"2_CR36","doi-asserted-by":"crossref","unstructured":"Long, Y., et al.: CapDet: Unifying dense captioning and open-world detection pretraining. In: IEEE CVPR, pp. 15233\u201315243 (2023)","DOI":"10.1109\/CVPR52729.2023.01462"},{"key":"2_CR37","unstructured":"Peng, Z., et al.: Kosmos-2: grounding multimodal large language models to the world. In: ICLR (2024)"},{"key":"2_CR38","unstructured":"Radford, A., et al.: Learning transferable visual models from natural language supervision. In: ICML, pp. 8748\u20138763 (2021)"},{"key":"2_CR39","doi-asserted-by":"crossref","unstructured":"Rasheed, H., et al.: GLaMM: pixel grounding large multimodal model. In: IEEE CVPR (2024)","DOI":"10.1109\/CVPR52733.2024.01236"},{"key":"2_CR40","doi-asserted-by":"crossref","unstructured":"Ridnik, T., et al.: Asymmetric loss for multi-label classification. In: IEEE CVPR, pp. 82\u201391 (2021)","DOI":"10.1109\/ICCV48922.2021.00015"},{"key":"2_CR41","doi-asserted-by":"crossref","unstructured":"Shao, S., et al.: Objects365: a large-scale, high-quality dataset for object detection. In: IEEE ICCV, pp. 8430\u20138439 (2019)","DOI":"10.1109\/ICCV.2019.00852"},{"key":"2_CR42","doi-asserted-by":"publisher","unstructured":"Shao, Z., Han, J., Debattista, K., Pang, Y.: DCMSTRD: end-to-end dense captioning via multi-scale transformer decoding. IEEE Trans. Multimed. 1\u201313 (2024). 
https:\/\/doi.org\/10.1109\/TMM.2024.3369863","DOI":"10.1109\/TMM.2024.3369863"},{"key":"2_CR43","unstructured":"Shao, Z., Han, J., Marnerides, D., Debattista, K.: Region-object relation-aware dense captioning via transformer. IEEE TNNLS (2022)"},{"key":"2_CR44","doi-asserted-by":"crossref","unstructured":"Song, H., Wang, Y., Zhang, K., Zhang, W., Liu, T.: Bob: BERT over BERT for training persona-based dialogue models from limited personalized data. In: ACL, pp. 167\u2013177 (2021)","DOI":"10.18653\/v1\/2021.acl-long.14"},{"key":"2_CR45","doi-asserted-by":"crossref","unstructured":"Sun, Z., et al.: Alpha-clip: a clip model focusing on wherever you want. In: IEEE CVPR (2024)","DOI":"10.1109\/CVPR52733.2024.01237"},{"key":"2_CR46","unstructured":"Touvron, H., et al.: LLaMA: open and efficient foundation language models. arXiv preprint arXiv:2302.13971 (2023)"},{"key":"2_CR47","unstructured":"Vaswani, A., et al.: Attention is all you need. In: NeurIPS (2017)"},{"key":"2_CR48","unstructured":"Wang, T., et al.: Caption anything: interactive image description with diverse multimodal controls. arXiv preprint arXiv:2305.02677 (2023)"},{"key":"2_CR49","unstructured":"Wang, W., et\u00a0al.: The all-seeing project: towards panoptic visual recognition and understanding of the open world. In: ICLR (2024)"},{"key":"2_CR50","unstructured":"Wu, J., et al.: GRiT: a generative region-to-text transformer for object understanding. arXiv preprint arXiv:2212.00280 (2022)"},{"key":"2_CR51","doi-asserted-by":"crossref","unstructured":"Yang, L., Tang, K., Yang, J., Li, L.J.: Dense captioning with joint inference and visual context. In: IEEE CVPR, pp. 2193\u20132202 (2017)","DOI":"10.1109\/CVPR.2017.214"},{"key":"2_CR52","doi-asserted-by":"crossref","unstructured":"Yin, G., Sheng, L., Liu, B., Yu, N., Wang, X., Shao, J.: Context and attribute grounded dense captioning. In: IEEE CVPR, pp. 6241\u20136250 (2019)","DOI":"10.1109\/CVPR.2019.00640"},{"key":"2_CR53","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"69","DOI":"10.1007\/978-3-319-46475-6_5","volume-title":"Computer Vision \u2013 ECCV 2016","author":"L Yu","year":"2016","unstructured":"Yu, L., Poirson, P., Yang, S., Berg, A.C., Berg, T.L.: Modeling context in referring expressions. In: Leibe, B., Matas, J., Sebe, N., Welling, M. (eds.) ECCV 2016. LNCS, vol. 9906, pp. 69\u201385. Springer, Cham (2016). https:\/\/doi.org\/10.1007\/978-3-319-46475-6_5"},{"key":"2_CR54","doi-asserted-by":"crossref","unstructured":"Yu, L., Tan, H., Bansal, M., Berg, T.L.: A joint speaker-listener-reinforcer model for referring expressions. In: IEEE CVPR, pp. 7282\u20137290 (2017)","DOI":"10.1109\/CVPR.2017.375"},{"key":"2_CR55","doi-asserted-by":"crossref","unstructured":"Yu, Q., et al.: CapsFusion: rethinking image-text data at scale. arXiv preprint arXiv:2310.20550 (2023)","DOI":"10.1109\/CVPR52733.2024.01330"},{"key":"2_CR56","doi-asserted-by":"crossref","unstructured":"Yuan, Y., et al.: Osprey: pixel understanding with visual instruction tuning. In: IEEE CVPR (2024)","DOI":"10.1109\/CVPR52733.2024.02664"},{"key":"2_CR57","doi-asserted-by":"crossref","unstructured":"Zhang, H., Song, H., Li, S., Zhou, M., Song, D.: A survey of controllable text generation using transformer-based pre-trained language models. arXiv preprint arXiv:2201.05337 (2022)","DOI":"10.1145\/3617680"},{"key":"2_CR58","unstructured":"Zhang, S., et al.: GPT4RoI: Instruction tuning large language model on region-of-interest. 
arXiv preprint arXiv:2307.03601 (2023)"},{"key":"2_CR59","unstructured":"Zhang, S., et al.: OPT: open pre-trained transformer language models. arXiv preprint arXiv:2205.01068 (2022)"},{"key":"2_CR60","unstructured":"Zhang, Y., et\u00a0al.: Recognize anything: a strong image tagging model. arXiv preprint arXiv:2306.03514 (2023)"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-72920-1_2","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,11,28]],"date-time":"2024-11-28T21:53:26Z","timestamp":1732830806000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-72920-1_2"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,1]]},"ISBN":["9783031729195","9783031729201"],"references-count":60,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-72920-1_2","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2024,10,1]]},"assertion":[{"value":"1 October 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}