{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,3,26]],"date-time":"2025-03-26T17:13:52Z","timestamp":1743009232504,"version":"3.40.3"},"publisher-location":"Cham","reference-count":29,"publisher":"Springer Nature Switzerland","isbn-type":[{"type":"print","value":"9783031416781"},{"type":"electronic","value":"9783031416798"}],"license":[{"start":{"date-parts":[[2023,1,1]],"date-time":"2023-01-01T00:00:00Z","timestamp":1672531200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2023,1,1]],"date-time":"2023-01-01T00:00:00Z","timestamp":1672531200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2023]]},"DOI":"10.1007\/978-3-031-41679-8_16","type":"book-chapter","created":{"date-parts":[[2023,8,18]],"date-time":"2023-08-18T07:02:59Z","timestamp":1692342179000},"page":"280-294","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Re-Thinking Text Clustering for\u00a0Images with\u00a0Text"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0009-0008-5656-3490","authenticated-orcid":false,"given":"Shwet Kamal","family":"Mishra","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8201-2253","authenticated-orcid":false,"given":"Soham","family":"Joshi","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7813-877X","authenticated-orcid":false,"given":"Viswanath","family":"Gopalakrishnan","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2023,8,19]]},"reference":[{"key":"16_CR1","doi-asserted-by":"crossref","unstructured":"Singh, A., et al.: Towards VQA models that can read. arXiv 2019, arxiv.org\/abs\/1904.08920","DOI":"10.1109\/CVPR.2019.00851"},{"key":"16_CR2","unstructured":"Krasin, I., et al.: OpenImages: a public dataset for large-scale multi-label and multi-class image classification (2016)"},{"key":"16_CR3","doi-asserted-by":"crossref","unstructured":"Karatzas, D., et al.: ICDAR 2013 robust reading competition. In: 2013 12th International Conference on Document Analysis and Recognition, pp. 1484\u20131493 (2013)","DOI":"10.1109\/ICDAR.2013.221"},{"key":"16_CR4","doi-asserted-by":"crossref","unstructured":"Karatzas, D., et al.: ICDAR 2015 competition on Robust Reading (2015)","DOI":"10.1109\/ICDAR.2015.7333942"},{"key":"16_CR5","doi-asserted-by":"crossref","unstructured":"Deng, J., Dong, W., Socher, R., Li, L., Li, K., Fei-Fei, L.: ImageNet: a large-scale hierarchical image database. In: 2009 IEEE Conference on Computer Vision and Pattern Recognition, pp. 248\u2013255 (2009)","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"16_CR6","doi-asserted-by":"crossref","unstructured":"Gurari, D., et al.: VizWiz grand challenge: answering visual questions from blind people. arXiv 2018, arxiv.org\/abs\/1802.08218","DOI":"10.1109\/CVPR.2018.00380"},{"key":"16_CR7","doi-asserted-by":"crossref","unstructured":"Mishra, A., Alahari, K., Jawahar, C.: Image retrieval using textual cues. In: 2013 IEEE International Conference On Computer Vision, pp. 3040\u20133047 (2013)","DOI":"10.1109\/ICCV.2013.378"},{"key":"16_CR8","unstructured":"Krishna, R., et al.: Visual genome: connecting language and vision using crowdsourced dense image annotations. arXiv 2016, arxiv.org\/abs\/1602.07332"},{"key":"16_CR9","unstructured":"Veit, A., Matera, T., Neumann, L., Matas, J., Belongie, S.: COCO-text: dataset and benchmark for text detection and recognition in natural images. arXiv 2016, arxiv.org\/abs\/1601.07140"},{"key":"16_CR10","doi-asserted-by":"crossref","unstructured":"Mishra, A., Shekhar, S., Singh, A., Chakraborty, A.: OCR-VQA: visual question answering by reading text in images. In: ICDAR (2019)","DOI":"10.1109\/ICDAR.2019.00156"},{"key":"16_CR11","unstructured":"Iwana, B., Rizvi, S., Ahmed, S., Dengel, A., Uchida, S.: Judging a book by its cover. arXiv 2016, arxiv.org\/abs\/1610.09204"},{"key":"16_CR12","doi-asserted-by":"crossref","unstructured":"Hu, R., Singh, A., Darrell, T., Rohrbach, M.: Iterative answer prediction with pointer-augmented multimodal transformers for TextVQA. arXiv 2019, arxiv.org\/abs\/1911.06258","DOI":"10.1109\/CVPR42600.2020.01001"},{"key":"16_CR13","unstructured":"Vaswani, A., et al.: Attention is all you need. arXiv 2017, arxiv.org\/abs\/1706.03762"},{"key":"16_CR14","unstructured":"Devlin, J., Chang, M., Lee, K., Toutanova, K.: BERT: pre-training of deep bidirectional transformers for language understanding. arXiv 2018, arxiv.org\/abs\/1810.04805"},{"key":"16_CR15","unstructured":"Ren, S., He, K., Girshick, R., Sun, J.: Faster R-CNN: towards real-time object detection with region proposal networks. arXiv 2015, arxiv.org\/abs\/1506.01497"},{"key":"16_CR16","doi-asserted-by":"crossref","unstructured":"Bojanowski, P., Grave, E., Joulin, A., Mikolov, T.: Enriching word vectors with subword information. arXiv 2016, arxiv.org\/abs\/1607.04606","DOI":"10.1162\/tacl_a_00051"},{"key":"16_CR17","doi-asserted-by":"publisher","first-page":"2552","DOI":"10.1109\/TPAMI.2014.2339814","volume":"36","author":"J Almaz\u00e1n","year":"2014","unstructured":"Almaz\u00e1n, J., Gordo, A., Forn\u00e9s, A., Valveny, E.: Word spotting and recognition with embedded attributes. IEEE Trans. Pattern Anal. Mach. Intell. 36, 2552\u20132566 (2014)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"16_CR18","doi-asserted-by":"crossref","unstructured":"Biten, A., et al.: Scene text visual question answering. arXiv 2019, arxiv.org\/abs\/1905.13648","DOI":"10.1109\/ICCV.2019.00439"},{"key":"16_CR19","unstructured":"Jiang, Y., Natarajan, V., Chen, X., Rohrbach, M., Batra, D., Parikh, D.: Pythia v0.1: the winning entry to the VQA challenge 2018. arXiv 2018, arxiv.org\/abs\/1807.09956"},{"key":"16_CR20","doi-asserted-by":"crossref","unstructured":"Biten, A., et al.: ICDAR 2019 competition on scene text visual question answering (2019)","DOI":"10.1109\/ICCV.2019.00439"},{"key":"16_CR21","doi-asserted-by":"crossref","unstructured":"Gao, D., Li, K., Wang, R., Shan, S., Chen, X.: Multi-modal graph neural network for joint reasoning on vision and scene text. arXiv 2020, arxiv.org\/abs\/2003.13962","DOI":"10.1109\/CVPR42600.2020.01276"},{"key":"16_CR22","doi-asserted-by":"crossref","unstructured":"Kant, Y., et al.: Spatially aware multimodal transformers for TextVQA. arXiv 2020, arxiv.org\/abs\/2007.12146","DOI":"10.1007\/978-3-030-58545-7_41"},{"key":"16_CR23","unstructured":"Chen, X., et al.: PaLI: a jointly-scaled multilingual language-image model. arXiv 2022, arxiv.org\/abs\/2209.06794"},{"key":"16_CR24","unstructured":"Wang, J., et al.: GIT: a generative image-to-text transformer for vision and language. arXiv 2022, arxiv.org\/abs\/2205.14100"},{"key":"16_CR25","doi-asserted-by":"crossref","unstructured":"Kil, J., et al.: PreSTU: pre-training for scene-text understanding. arXiv 2022, arxiv.org\/abs\/2209.05534","DOI":"10.1109\/ICCV51070.2023.01401"},{"key":"16_CR26","doi-asserted-by":"crossref","unstructured":"Lu, X., Fan, Z., Wang, Y., Oh, J., Rose, C.: Localize, group, and select: boosting text-VQA by scene text modeling (2021). arxiv.org\/abs\/2108.08965","DOI":"10.1109\/ICCVW54120.2021.00297"},{"key":"16_CR27","doi-asserted-by":"crossref","unstructured":"Singh, A., Mishra, A., Shekhar, S., Chakraborty, A.: From strings to things: knowledge-enabled VQA Model that can Read and Reason. In: ICCV (2019)","DOI":"10.1109\/ICCV.2019.00470"},{"key":"16_CR28","unstructured":"Borisyuk, F., Gordo, A., Sivakumar, V.: Rosetta: large scale system for text detection and recognition in images. CoRR abs\/1910.05085 (2019). arxiv.org\/abs\/1910.05085"},{"key":"16_CR29","unstructured":"Ester, M., Kriegel, H., Sander, J., Xu, X.: A density-based algorithm for discovering clusters in large spatial databases with noise. In: Knowledge Discovery and Data Mining (1996)"}],"container-title":["Lecture Notes in Computer Science","Document Analysis and Recognition - ICDAR 2023"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-41679-8_16","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,10,26]],"date-time":"2024-10-26T09:42:39Z","timestamp":1729935759000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-41679-8_16"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023]]},"ISBN":["9783031416781","9783031416798"],"references-count":29,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-41679-8_16","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2023]]},"assertion":[{"value":"19 August 2023","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ICDAR","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Conference on Document Analysis and Recognition","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"San Jos\u00e9, CA","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"USA","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2023","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"21 August 2023","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"26 August 2023","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"17","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"icdar2023","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/icdar2023.org\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Double-blind","order":1,"name":"type","label":"Type","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"Easychair","order":2,"name":"conference_management_system","label":"Conference Management System","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"316","order":3,"name":"number_of_submissions_sent_for_review","label":"Number of Submissions Sent for Review","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"154","order":4,"name":"number_of_full_papers_accepted","label":"Number of Full Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"0","order":5,"name":"number_of_short_papers_accepted","label":"Number of Short Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"49% - The value is computed by the equation \"Number of Full Papers Accepted \/ Number of Submissions Sent for Review * 100\" and then rounded to a whole number.","order":6,"name":"acceptance_rate_of_full_papers","label":"Acceptance Rate of Full Papers","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"2.89","order":7,"name":"average_number_of_reviews_per_paper","label":"Average Number of Reviews per Paper","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"1.50","order":8,"name":"average_number_of_papers_per_reviewer","label":"Average Number of Papers per Reviewer","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"No","order":9,"name":"external_reviewers_involved","label":"External Reviewers Involved","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"Number and type of other papers accepted : IJDAR track papers","order":10,"name":"additional_info_on_review_process","label":"Additional Info on Review Process","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}}]}}