diff --git a/deepdoc/parser/pdf_parser.py b/deepdoc/parser/pdf_parser.py index 909b89d5cc9..4c04d9e4b70 100644 --- a/deepdoc/parser/pdf_parser.py +++ b/deepdoc/parser/pdf_parser.py @@ -285,24 +285,10 @@ def __ocr(self, pagenum, img, chars, ZM=3): "page_number": pagenum} for b, t in bxs if b[0][0] <= b[1][0] and b[0][1] <= b[-1][1]], self.mean_height[-1] / 3 ) - - # solve char content confusion - record_error_length, ct = 0, 0.001 - for c in chars[0:128]: - ii = Recognizer.find_overlapped(c, bxs) - if ii is None: - continue - record_error_length += abs((bxs[ii]["bottom"] + bxs[ii]["top"] - c["bottom"] - c["top"]) / 2) - ct += 1 - - record_error_length = record_error_length / ct - for char in chars: - char["top"] -= record_error_length - char["bottom"] -= record_error_length # merge chars in the same rect - for c in Recognizer.sort_X_firstly( - chars, self.mean_width[pagenum - 1] // 4): + for c in Recognizer.sort_Y_firstly( + chars, self.mean_height[pagenum - 1] // 4): ii = Recognizer.find_overlapped(c, bxs) if ii is None: self.lefted_chars.append(c)