From 7d5888c2f189fc77f6f768a1fe505b30fe02af5d Mon Sep 17 00:00:00 2001 From: H <43509927+guoyuhao2330@users.noreply.github.com> Date: Thu, 11 Jul 2024 11:01:55 +0800 Subject: [PATCH 1/2] Update pdf_parser.py --- deepdoc/parser/pdf_parser.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/deepdoc/parser/pdf_parser.py b/deepdoc/parser/pdf_parser.py index 5bb9bffac06..a12fc343da4 100644 --- a/deepdoc/parser/pdf_parser.py +++ b/deepdoc/parser/pdf_parser.py @@ -286,6 +286,20 @@ def __ocr(self, pagenum, img, chars, ZM=3): self.mean_height[-1] / 3 ) + # solve char content confusion + record_error_length = 0 + for c in Recognizer.sort_X_firstly(chars, self.mean_width[pagenum - 1] // 4)[0:128]: + ii = Recognizer.find_overlapped(c, bxs) + if ii is None: + self.lefted_chars.append(c) + continue + record_error_length += abs((bxs[ii]["bottom"] + bxs[ii]["bottom"] - c["bottom"] - c["top"]) / 2) + + record_error_length = record_error_length / 128 + for char in chars: + char["top"] -= record_error_length + char["bottom"] -= record_error_length + # merge chars in the same rect for c in Recognizer.sort_X_firstly( chars, self.mean_width[pagenum - 1] // 4): From c5518a5329d59b25c20bb5e1df1d691fc3b7d0a8 Mon Sep 17 00:00:00 2001 From: H <43509927+guoyuhao2330@users.noreply.github.com> Date: Thu, 11 Jul 2024 12:07:00 +0800 Subject: [PATCH 2/2] Update pdf_parser.py --- deepdoc/parser/pdf_parser.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/deepdoc/parser/pdf_parser.py b/deepdoc/parser/pdf_parser.py index a12fc343da4..fce9e91de4c 100644 --- a/deepdoc/parser/pdf_parser.py +++ b/deepdoc/parser/pdf_parser.py @@ -288,10 +288,9 @@ def __ocr(self, pagenum, img, chars, ZM=3): # solve char content confusion record_error_length = 0 - for c in Recognizer.sort_X_firstly(chars, self.mean_width[pagenum - 1] // 4)[0:128]: + for c in chars[0:128]: ii = Recognizer.find_overlapped(c, bxs) if ii is None: - self.lefted_chars.append(c) continue record_error_length += abs((bxs[ii]["bottom"] + bxs[ii]["bottom"] - c["bottom"] - c["top"]) / 2)