Skip to content

Commit

Permalink
Merge pull request #54 from bertsky/set-dpi
Browse files: browse the repository at this point in the history
Inform Tesseract of the true DPI, as detected from the source image, in all processors (this notably improves layout recognition). Also fixes #55.
  • Loading branch information
bertsky authored Jul 11, 2019
2 parents cb053c3 + e9c95ba commit 4a69ba1
Show file tree
Hide file tree
Showing 6 changed files with 69 additions and 7 deletions.
31 changes: 27 additions & 4 deletions ocrd_tesserocr/crop.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,20 +10,22 @@
from ocrd_models.ocrd_page import (
MetadataItemType,
LabelsType, LabelType,
CoordsType,
CoordsType, AlternativeImageType,
to_xml
)
from ocrd_models.ocrd_page_generateds import BorderType
from ocrd_models import OcrdExif
from ocrd import Processor

from .config import TESSDATA_PREFIX, OCRD_TOOL
from .common import (
bbox_from_points, points_from_bbox,
bbox_from_xywh
bbox_from_xywh, save_image_file
)

TOOL = 'ocrd-tesserocr-crop'
LOG = getLogger('processor.TesserocrCrop')
FILEGRP_IMG = 'OCR-D-IMG-CROP'

class TesserocrCrop(Processor):

Expand Down Expand Up @@ -87,8 +89,15 @@ def process(self):
min_x, max_x, min_y, max_y)

page_image = self.workspace.resolve_image_as_pil(page.imageFilename)
dpi = page_image.info.get('dpi', (300,300))[0]
zoom = 300 / dpi
page_image_info = OcrdExif(page_image)
if page_image_info.xResolution != 1:
dpi = page_image_info.xResolution
if page_image_info.resolutionUnit == 'cm':
dpi = round(dpi * 2.54)
tessapi.SetVariable('user_defined_dpi', str(dpi))
zoom = 300 / dpi
else:
zoom = 1
LOG.debug("Cropping with tesseract")
tessapi.SetImage(page_image)
# PSM.SPARSE_TEXT: get as much text as possible in no particular order
Expand Down Expand Up @@ -146,7 +155,21 @@ def process(self):
LOG.debug("Padded page border: %i:%i,%i:%i", min_x, max_x, min_y, max_y)
border = BorderType(Coords=CoordsType(
points_from_bbox(min_x, min_y, max_x, max_y)))
# update PAGE (annotate border):
page.set_Border(border)
# update METS (add the image file):
page_image = page_image.crop(
box=(min_x, min_y, max_x, max_y))
file_id = input_file.ID.replace(self.input_file_grp, FILEGRP_IMG)
if file_id == input_file.ID:
file_id = concat_padded(FILEGRP_IMG, n)
file_path = save_image_file(self.workspace, page_image,
file_id,
page_id=page_id,
file_grp=FILEGRP_IMG)
# update PAGE (reference the image file):
page.add_AlternativeImage(AlternativeImageType(
filename=file_path, comments="cropped"))
else:
LOG.error("Cannot find valid extent for page '%s'", page_id)

Expand Down
17 changes: 14 additions & 3 deletions ocrd_tesserocr/deskew.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
TextRegionType, PageType,
to_xml
)
from ocrd_models import OcrdExif
from ocrd import Processor

from .config import TESSDATA_PREFIX, OCRD_TOOL
Expand Down Expand Up @@ -85,6 +86,12 @@ def process(self):
for name in self.parameter.keys()])]))
page = pcgts.get_Page()
page_image = self.workspace.resolve_image_as_pil(page.imageFilename)
page_image_info = OcrdExif(page_image)
if page_image_info.xResolution != 1:
dpi = page_image_info.xResolution
if page_image_info.resolutionUnit == 'cm':
dpi = round(dpi * 2.54)
tessapi.SetVariable('user_defined_dpi', str(dpi))
LOG.info("Deskewing on '%s' level in page '%s'", oplevel, page_id)

page_image, page_xywh = image_from_page(
Expand Down Expand Up @@ -118,7 +125,11 @@ def process(self):
content=to_xml(pcgts))

def _process_segment(self, tessapi, segment, image, xywh, where, page_id, file_id):
comments = 'cropped'
if (isinstance(segment, PageType) and
not xywh['x'] and not xywh['y']):
comments = ''
else:
comments = 'cropped'
angle = 0.
tessapi.SetImage(image)
#tessapi.SetPageSegMode(PSM.AUTO_OSD)
Expand All @@ -127,7 +138,7 @@ def _process_segment(self, tessapi, segment, image, xywh, where, page_id, file_i
#
osr = tessapi.DetectOrientationScript()
if osr:
assert osr['orient_conf'] and not math.isnan(osr['orient_conf']), \
assert not math.isnan(osr['orient_conf']), \
"orientation detection failed (Tesseract probably compiled without legacy OEM, or osd model not installed)"
if osr['orient_conf'] < 10:
LOG.info('ignoring OSD orientation result %d° due to low confidence %.0f in %s',
Expand All @@ -138,7 +149,7 @@ def _process_segment(self, tessapi, segment, image, xywh, where, page_id, file_i
angle = osr['orient_deg']
if angle:
comments += ',rotated-%d' % angle
assert osr['script_conf'] and not math.isnan(osr['script_conf']), \
assert not math.isnan(osr['script_conf']), \
"script detection failed (Tesseract probably compiled without legacy OEM, or osd model not installed)"
if osr['script_conf'] < 10:
LOG.info('ignoring OSD script result "%s" due to low confidence %.0f in %s',
Expand Down
7 changes: 7 additions & 0 deletions ocrd_tesserocr/recognize.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
TextEquivType, TextStyleType,
to_xml)
from ocrd_modelfactory import page_from_file
from ocrd_models import OcrdExif
from ocrd import Processor

from .config import TESSDATA_PREFIX, OCRD_TOOL
Expand Down Expand Up @@ -133,6 +134,12 @@ def process(self):
for name in self.parameter.keys()])]))
page = pcgts.get_Page()
page_image = self.workspace.resolve_image_as_pil(page.imageFilename)
page_image_info = OcrdExif(page_image)
if page_image_info.xResolution != 1:
dpi = page_image_info.xResolution
if page_image_info.resolutionUnit == 'cm':
dpi = round(dpi * 2.54)
tessapi.SetVariable('user_defined_dpi', str(dpi))
page_image, page_xywh = image_from_page(
self.workspace, page, page_image, page_id)
#tessapi.SetImage(page_image)
Expand Down
7 changes: 7 additions & 0 deletions ocrd_tesserocr/segment_line.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
TextLineType,
to_xml
)
from ocrd_models import OcrdExif

from .config import TESSDATA_PREFIX, OCRD_TOOL
from .common import (
Expand Down Expand Up @@ -71,6 +72,12 @@ def process(self):
for name in self.parameter.keys()])]))
page = pcgts.get_Page()
page_image = self.workspace.resolve_image_as_pil(page.imageFilename)
page_image_info = OcrdExif(page_image)
if page_image_info.xResolution != 1:
dpi = page_image_info.xResolution
if page_image_info.resolutionUnit == 'cm':
dpi = round(dpi * 2.54)
tessapi.SetVariable('user_defined_dpi', str(dpi))
page_image, page_xywh = image_from_page(
self.workspace, page, page_image, page_id)

Expand Down
7 changes: 7 additions & 0 deletions ocrd_tesserocr/segment_region.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
NoiseRegionType,
to_xml)
from ocrd_models.ocrd_page_generateds import TableRegionType
from ocrd_models import OcrdExif
from ocrd import Processor

from .config import TESSDATA_PREFIX, OCRD_TOOL
Expand Down Expand Up @@ -126,6 +127,12 @@ def process(self):
else:
LOG.warning('keeping existing ReadingOrder')
page_image = self.workspace.resolve_image_as_pil(page.imageFilename)
page_image_info = OcrdExif(page_image)
if page_image_info.xResolution != 1:
dpi = page_image_info.xResolution
if page_image_info.resolutionUnit == 'cm':
dpi = round(dpi * 2.54)
tessapi.SetVariable('user_defined_dpi', str(dpi))
page_image, page_xywh = image_from_page(
self.workspace, page, page_image, page_id)
LOG.info("Detecting regions in page '%s'", page_id)
Expand Down
7 changes: 7 additions & 0 deletions ocrd_tesserocr/segment_word.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
WordType,
to_xml
)
from ocrd_models import OcrdExif

from ocrd_tesserocr.config import TESSDATA_PREFIX, OCRD_TOOL
from .common import (
Expand Down Expand Up @@ -71,6 +72,12 @@ def process(self):
for name in self.parameter.keys()])]))
page = pcgts.get_Page()
page_image = self.workspace.resolve_image_as_pil(page.imageFilename)
page_image_info = OcrdExif(page_image)
if page_image_info.xResolution != 1:
dpi = page_image_info.xResolution
if page_image_info.resolutionUnit == 'cm':
dpi = round(dpi * 2.54)
tessapi.SetVariable('user_defined_dpi', str(dpi))
page_image, page_xywh = image_from_page(
self.workspace, page, page_image, page_id)

Expand Down

0 comments on commit 4a69ba1

Please sign in to comment.