-
Notifications
You must be signed in to change notification settings - Fork 24
/
Copy pathExtractRegions.h
50 lines (38 loc) · 1.62 KB
/
ExtractRegions.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
#ifndef __figureextractor__ExtractRegions__
#define __figureextractor__ExtractRegions__
#include <vector>
#include <unordered_map>
#include <poppler/TextOutputDev.h>
#include <leptonica/allheaders.h>
#include "PDFUtils.h"
#include "ExtractCaptions.h"
/**
Module for dividing up an image of a pdf file into
different regions and classifying those regions.
*/
class PageRegions {
public:
std::vector<Caption> captions; // Captions in the page
BOXA *bodytext; // Regions of body text,
BOXA *graphics; // Regions of graphics, should be high recall
BOXA *other; // Any regions that have content were not catagorized either way
PageRegions(std::vector<Caption> captions, BOXA *bodytext, BOXA *graphics,
BOXA *other)
: captions(captions), bodytext(bodytext), graphics(graphics),
other(other) {}
PIX *drawRegions(PIX *background);
PIX *drawRegionsBW(PIX *background);
BOXA *getCaptionsBoxa();
};
/**
Given an image of a PDF page (orginal) the text (text) a PIX contain
only the grpahics (graphics) and the locations of the caption starts
of the page (captionStarts), returns a PageRegions object that breaks the
PDF page in text, graphic, caption, or other regions. Does not take
ownerish of any arguements.
*/
PageRegions getPageRegions(PIX *original, TextPage *text, PIX *graphics,
const std::vector<Caption> &captionStarts,
DocumentStatistics &docStats, int page, bool verbose,
bool showSteps, std::vector<Figure> &errors);
#endif /* defined(__figureextractor__ExtractRegions__) */