From 41cfa120a344440e3a55dc7bcedfd1e8beddd6e1 Mon Sep 17 00:00:00 2001 From: Robert Schubert Date: Tue, 9 Jul 2019 15:09:22 +0200 Subject: [PATCH 1/5] deskew: relax assertion --- ocrd_tesserocr/deskew.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ocrd_tesserocr/deskew.py b/ocrd_tesserocr/deskew.py index ad6bdde..c47cd48 100644 --- a/ocrd_tesserocr/deskew.py +++ b/ocrd_tesserocr/deskew.py @@ -127,7 +127,7 @@ def _process_segment(self, tessapi, segment, image, xywh, where, page_id, file_i # osr = tessapi.DetectOrientationScript() if osr: - assert osr['orient_conf'] and not math.isnan(osr['orient_conf']), \ + assert not math.isnan(osr['orient_conf']), \ "orientation detection failed (Tesseract probably compiled without legacy OEM, or osd model not installed)" if osr['orient_conf'] < 10: LOG.info('ignoring OSD orientation result %d° due to low confidence %.0f in %s', @@ -138,7 +138,7 @@ def _process_segment(self, tessapi, segment, image, xywh, where, page_id, file_i angle = osr['orient_deg'] if angle: comments += ',rotated-%d' % angle - assert osr['script_conf'] and not math.isnan(osr['script_conf']), \ + assert not math.isnan(osr['script_conf']), \ "script detection failed (Tesseract probably compiled without legacy OEM, or osd model not installed)" if osr['script_conf'] < 10: LOG.info('ignoring OSD script result "%s" due to low confidence %.0f in %s', From 6b5c1ff9bf4fb44cba55cb545175fe97b3e41ab9 Mon Sep 17 00:00:00 2001 From: Robert Schubert Date: Tue, 9 Jul 2019 15:13:25 +0200 Subject: [PATCH 2/5] set parameter user_defined_dpi to DPI value from source image tags --- ocrd_tesserocr/crop.py | 8 ++++++-- ocrd_tesserocr/deskew.py | 3 +++ ocrd_tesserocr/recognize.py | 3 +++ ocrd_tesserocr/segment_line.py | 3 +++ ocrd_tesserocr/segment_region.py | 3 +++ ocrd_tesserocr/segment_word.py | 3 +++ 6 files changed, 21 insertions(+), 2 deletions(-) diff --git a/ocrd_tesserocr/crop.py b/ocrd_tesserocr/crop.py index 534e043..1023655 100644 --- a/ocrd_tesserocr/crop.py +++ b/ocrd_tesserocr/crop.py @@ -87,8 +87,12 @@ def process(self): min_x, max_x, min_y, max_y) page_image = self.workspace.resolve_image_as_pil(page.imageFilename) - dpi = page_image.info.get('dpi', (300,300))[0] - zoom = 300 / dpi + dpi = page_image.info.get('dpi', (0,0))[0] + if dpi: + tessapi.SetVariable('user_defined_dpi', str(dpi)) + zoom = 300 / dpi + else: + zoom = 1 LOG.debug("Cropping with tesseract") tessapi.SetImage(page_image) # PSM.SPARSE_TEXT: get as much text as possible in no particular order diff --git a/ocrd_tesserocr/deskew.py b/ocrd_tesserocr/deskew.py index c47cd48..99c75ea 100644 --- a/ocrd_tesserocr/deskew.py +++ b/ocrd_tesserocr/deskew.py @@ -85,6 +85,9 @@ def process(self): for name in self.parameter.keys()])])) page = pcgts.get_Page() page_image = self.workspace.resolve_image_as_pil(page.imageFilename) + dpi = page_image.info.get('dpi', (0,0))[0] + if dpi: + tessapi.SetVariable('user_defined_dpi', str(dpi)) LOG.info("Deskewing on '%s' level in page '%s'", oplevel, page_id) page_image, page_xywh = image_from_page( diff --git a/ocrd_tesserocr/recognize.py b/ocrd_tesserocr/recognize.py index 959200e..399aab2 100644 --- a/ocrd_tesserocr/recognize.py +++ b/ocrd_tesserocr/recognize.py @@ -133,6 +133,9 @@ def process(self): for name in self.parameter.keys()])])) page = pcgts.get_Page() page_image = self.workspace.resolve_image_as_pil(page.imageFilename) + dpi = page_image.info.get('dpi', (0,0))[0] + if dpi: + tessapi.SetVariable('user_defined_dpi', str(dpi)) page_image, page_xywh = image_from_page( self.workspace, page, page_image, page_id) #tessapi.SetImage(page_image) diff --git a/ocrd_tesserocr/segment_line.py b/ocrd_tesserocr/segment_line.py index 4890237..ef54a59 100644 --- a/ocrd_tesserocr/segment_line.py +++ b/ocrd_tesserocr/segment_line.py @@ -71,6 +71,9 @@ def process(self): for name in self.parameter.keys()])])) page = pcgts.get_Page() page_image = self.workspace.resolve_image_as_pil(page.imageFilename) + dpi = page_image.info.get('dpi', (0,0))[0] + if dpi: + tessapi.SetVariable('user_defined_dpi', str(dpi)) page_image, page_xywh = image_from_page( self.workspace, page, page_image, page_id) diff --git a/ocrd_tesserocr/segment_region.py b/ocrd_tesserocr/segment_region.py index e131ebb..f846f65 100644 --- a/ocrd_tesserocr/segment_region.py +++ b/ocrd_tesserocr/segment_region.py @@ -126,6 +126,9 @@ def process(self): else: LOG.warning('keeping existing ReadingOrder') page_image = self.workspace.resolve_image_as_pil(page.imageFilename) + dpi = page_image.info.get('dpi', (0,0))[0] + if dpi: + tessapi.SetVariable('user_defined_dpi', str(dpi)) page_image, page_xywh = image_from_page( self.workspace, page, page_image, page_id) LOG.info("Detecting regions in page '%s'", page_id) diff --git a/ocrd_tesserocr/segment_word.py b/ocrd_tesserocr/segment_word.py index 38b0551..1918c71 100644 --- a/ocrd_tesserocr/segment_word.py +++ b/ocrd_tesserocr/segment_word.py @@ -71,6 +71,9 @@ def process(self): for name in self.parameter.keys()])])) page = pcgts.get_Page() page_image = self.workspace.resolve_image_as_pil(page.imageFilename) + dpi = page_image.info.get('dpi', (0,0))[0] + if dpi: + tessapi.SetVariable('user_defined_dpi', str(dpi)) page_image, page_xywh = image_from_page( self.workspace, page, page_image, page_id) From 01c020807b18a007336374091ba817227983da8e Mon Sep 17 00:00:00 2001 From: Robert Schubert Date: Tue, 9 Jul 2019 17:44:58 +0200 Subject: [PATCH 3/5] deskew: omit 'cropped' from comments unless the segment was actually cropped --- ocrd_tesserocr/deskew.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/ocrd_tesserocr/deskew.py b/ocrd_tesserocr/deskew.py index 99c75ea..0a10b1e 100644 --- a/ocrd_tesserocr/deskew.py +++ b/ocrd_tesserocr/deskew.py @@ -121,7 +121,11 @@ def process(self): content=to_xml(pcgts)) def _process_segment(self, tessapi, segment, image, xywh, where, page_id, file_id): - comments = 'cropped' + if (isinstance(segment, PageType) and + not xywh['x'] and not xywh['y']): + comments = '' + else: + comments = 'cropped' angle = 0. tessapi.SetImage(image) #tessapi.SetPageSegMode(PSM.AUTO_OSD) From 433f326f567a327d2d2ff6ed5972210b70d809bd Mon Sep 17 00:00:00 2001 From: Robert Schubert Date: Tue, 9 Jul 2019 17:46:13 +0200 Subject: [PATCH 4/5] crop: create AlternativeImage when successful (to prevent inconsistent annotation in case AlternativeImage was already present) --- ocrd_tesserocr/crop.py | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/ocrd_tesserocr/crop.py b/ocrd_tesserocr/crop.py index 1023655..b8a3ded 100644 --- a/ocrd_tesserocr/crop.py +++ b/ocrd_tesserocr/crop.py @@ -10,7 +10,7 @@ from ocrd_models.ocrd_page import ( MetadataItemType, LabelsType, LabelType, - CoordsType, + CoordsType, AlternativeImageType, to_xml ) from ocrd_models.ocrd_page_generateds import BorderType @@ -19,11 +19,12 @@ from .config import TESSDATA_PREFIX, OCRD_TOOL from .common import ( bbox_from_points, points_from_bbox, - bbox_from_xywh + bbox_from_xywh, save_image_file ) TOOL = 'ocrd-tesserocr-crop' LOG = getLogger('processor.TesserocrCrop') +FILEGRP_IMG = 'OCR-D-IMG-CROP' class TesserocrCrop(Processor): @@ -150,7 +151,21 @@ def process(self): LOG.debug("Padded page border: %i:%i,%i:%i", min_x, max_x, min_y, max_y) border = BorderType(Coords=CoordsType( points_from_bbox(min_x, min_y, max_x, max_y))) + # update PAGE (annotate border): page.set_Border(border) + # update METS (add the image file): + page_image = page_image.crop( + box=(min_x, min_y, max_x, max_y)) + file_id = input_file.ID.replace(self.input_file_grp, FILEGRP_IMG) + if file_id == input_file.ID: + file_id = concat_padded(FILEGRP_IMG, n) + file_path = save_image_file(self.workspace, page_image, + file_id, + page_id=page_id, + file_grp=FILEGRP_IMG) + # update PAGE (reference the image file): + page.add_AlternativeImage(AlternativeImageType( + filename=file_path, comments="cropped")) else: LOG.error("Cannot find valid extent for page '%s'", page_id) From e9c95ba985994fa6d60d29c83ebb762f377bd112 Mon Sep 17 00:00:00 2001 From: Robert Schubert Date: Thu, 11 Jul 2019 13:23:01 +0200 Subject: [PATCH 5/5] make DPI calculation based on OcrdExif abstraction instead of PIL --- ocrd_tesserocr/crop.py | 8 ++++++-- ocrd_tesserocr/deskew.py | 8 ++++++-- ocrd_tesserocr/recognize.py | 8 ++++++-- ocrd_tesserocr/segment_line.py | 8 ++++++-- ocrd_tesserocr/segment_region.py | 8 ++++++-- ocrd_tesserocr/segment_word.py | 8 ++++++-- 6 files changed, 36 insertions(+), 12 deletions(-) diff --git a/ocrd_tesserocr/crop.py b/ocrd_tesserocr/crop.py index b8a3ded..e63e08c 100644 --- a/ocrd_tesserocr/crop.py +++ b/ocrd_tesserocr/crop.py @@ -14,6 +14,7 @@ to_xml ) from ocrd_models.ocrd_page_generateds import BorderType +from ocrd_models import OcrdExif from ocrd import Processor from .config import TESSDATA_PREFIX, OCRD_TOOL @@ -88,8 +89,11 @@ def process(self): min_x, max_x, min_y, max_y) page_image = self.workspace.resolve_image_as_pil(page.imageFilename) - dpi = page_image.info.get('dpi', (0,0))[0] - if dpi: + page_image_info = OcrdExif(page_image) + if page_image_info.xResolution != 1: + dpi = page_image_info.xResolution + if page_image_info.resolutionUnit == 'cm': + dpi = round(dpi * 2.54) tessapi.SetVariable('user_defined_dpi', str(dpi)) zoom = 300 / dpi else: diff --git a/ocrd_tesserocr/deskew.py b/ocrd_tesserocr/deskew.py index 0a10b1e..6981d7c 100644 --- a/ocrd_tesserocr/deskew.py +++ b/ocrd_tesserocr/deskew.py @@ -22,6 +22,7 @@ TextRegionType, PageType, to_xml ) +from ocrd_models import OcrdExif from ocrd import Processor from .config import TESSDATA_PREFIX, OCRD_TOOL @@ -85,8 +86,11 @@ def process(self): for name in self.parameter.keys()])])) page = pcgts.get_Page() page_image = self.workspace.resolve_image_as_pil(page.imageFilename) - dpi = page_image.info.get('dpi', (0,0))[0] - if dpi: + page_image_info = OcrdExif(page_image) + if page_image_info.xResolution != 1: + dpi = page_image_info.xResolution + if page_image_info.resolutionUnit == 'cm': + dpi = round(dpi * 2.54) tessapi.SetVariable('user_defined_dpi', str(dpi)) LOG.info("Deskewing on '%s' level in page '%s'", oplevel, page_id) diff --git a/ocrd_tesserocr/recognize.py b/ocrd_tesserocr/recognize.py index 399aab2..b205dab 100644 --- a/ocrd_tesserocr/recognize.py +++ b/ocrd_tesserocr/recognize.py @@ -18,6 +18,7 @@ TextEquivType, TextStyleType, to_xml) from ocrd_modelfactory import page_from_file +from ocrd_models import OcrdExif from ocrd import Processor from .config import TESSDATA_PREFIX, OCRD_TOOL @@ -133,8 +134,11 @@ def process(self): for name in self.parameter.keys()])])) page = pcgts.get_Page() page_image = self.workspace.resolve_image_as_pil(page.imageFilename) - dpi = page_image.info.get('dpi', (0,0))[0] - if dpi: + page_image_info = OcrdExif(page_image) + if page_image_info.xResolution != 1: + dpi = page_image_info.xResolution + if page_image_info.resolutionUnit == 'cm': + dpi = round(dpi * 2.54) tessapi.SetVariable('user_defined_dpi', str(dpi)) page_image, page_xywh = image_from_page( self.workspace, page, page_image, page_id) diff --git a/ocrd_tesserocr/segment_line.py b/ocrd_tesserocr/segment_line.py index ef54a59..e54fb68 100644 --- a/ocrd_tesserocr/segment_line.py +++ b/ocrd_tesserocr/segment_line.py @@ -17,6 +17,7 @@ TextLineType, to_xml ) +from ocrd_models import OcrdExif from .config import TESSDATA_PREFIX, OCRD_TOOL from .common import ( @@ -71,8 +72,11 @@ def process(self): for name in self.parameter.keys()])])) page = pcgts.get_Page() page_image = self.workspace.resolve_image_as_pil(page.imageFilename) - dpi = page_image.info.get('dpi', (0,0))[0] - if dpi: + page_image_info = OcrdExif(page_image) + if page_image_info.xResolution != 1: + dpi = page_image_info.xResolution + if page_image_info.resolutionUnit == 'cm': + dpi = round(dpi * 2.54) tessapi.SetVariable('user_defined_dpi', str(dpi)) page_image, page_xywh = image_from_page( self.workspace, page, page_image, page_id) diff --git a/ocrd_tesserocr/segment_region.py b/ocrd_tesserocr/segment_region.py index f846f65..eff4520 100644 --- a/ocrd_tesserocr/segment_region.py +++ b/ocrd_tesserocr/segment_region.py @@ -26,6 +26,7 @@ NoiseRegionType, to_xml) from ocrd_models.ocrd_page_generateds import TableRegionType +from ocrd_models import OcrdExif from ocrd import Processor from .config import TESSDATA_PREFIX, OCRD_TOOL @@ -126,8 +127,11 @@ def process(self): else: LOG.warning('keeping existing ReadingOrder') page_image = self.workspace.resolve_image_as_pil(page.imageFilename) - dpi = page_image.info.get('dpi', (0,0))[0] - if dpi: + page_image_info = OcrdExif(page_image) + if page_image_info.xResolution != 1: + dpi = page_image_info.xResolution + if page_image_info.resolutionUnit == 'cm': + dpi = round(dpi * 2.54) tessapi.SetVariable('user_defined_dpi', str(dpi)) page_image, page_xywh = image_from_page( self.workspace, page, page_image, page_id) diff --git a/ocrd_tesserocr/segment_word.py b/ocrd_tesserocr/segment_word.py index 1918c71..e39730b 100644 --- a/ocrd_tesserocr/segment_word.py +++ b/ocrd_tesserocr/segment_word.py @@ -17,6 +17,7 @@ WordType, to_xml ) +from ocrd_models import OcrdExif from ocrd_tesserocr.config import TESSDATA_PREFIX, OCRD_TOOL from .common import ( @@ -71,8 +72,11 @@ def process(self): for name in self.parameter.keys()])])) page = pcgts.get_Page() page_image = self.workspace.resolve_image_as_pil(page.imageFilename) - dpi = page_image.info.get('dpi', (0,0))[0] - if dpi: + page_image_info = OcrdExif(page_image) + if page_image_info.xResolution != 1: + dpi = page_image_info.xResolution + if page_image_info.resolutionUnit == 'cm': + dpi = round(dpi * 2.54) tessapi.SetVariable('user_defined_dpi', str(dpi)) page_image, page_xywh = image_from_page( self.workspace, page, page_image, page_id)