-
Notifications
You must be signed in to change notification settings - Fork 5
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Replace OCRD-ZIP with BagIt-based spec #70
Changes from all commits
6279cb1
732cbff
93bb4be
46ec5e9
920c279
713974f
fda032f
c7746f5
0fdbe22
d4be7d9
5460f10
251a77c
c976224
90721da
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
{"Bagit-Profile-Info":{"Bagit-Profile-Identifier":"https://ocr-d.github.io/bagit-profile.json","Source-Organization":"OCR-D","External-Description":"BagIt profile for OCR data","Version":0.1},"Bag-Info":{"Bagging-Date":{"required":false},"Source-Organization":{"required":false},"Ocrd-Mets":{"required":false,"default":"data/mets.xml"},"Ocrd-Manifestation-Depth":{"required":false,"default":"partial","values":["partial","full"]},"Ocrd-Identifier":{"required":true},"Ocrd-Checksum":{"required":false,"default":"cf83e1357eefb8bdf1542850d66d8007d620e4050b5715dc83f4a921d36ce9ce47d0d13c5d85f2b0ff8318d2877eec2f63b931bd47417a81a538327af927da3e"}},"Manifests-Required":["sha512"],"Allow-Fetch.txt":false,"Serialization":"required","Accept-Serialization":"application/zip","Accept-BagIt-Version":[1]} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,30 @@ | ||
Bagit-Profile-Info: | ||
Bagit-Profile-Identifier: https://ocr-d.github.io/bagit-profile.json | ||
Source-Organization: OCR-D | ||
External-Description: BagIt profile for OCR data | ||
Version: 0.1 | ||
Bag-Info: | ||
Bagging-Date: | ||
required: false | ||
Source-Organization: | ||
required: false | ||
Ocrd-Mets: | ||
required: false | ||
default: 'data/mets.xml' | ||
Ocrd-Manifestation-Depth: | ||
required: false | ||
default: partial | ||
values: ["partial", "full"] | ||
Ocrd-Identifier: | ||
required: true | ||
Ocrd-Checksum: | ||
required: false | ||
# echo -n | sha512sum | ||
default: 'cf83e1357eefb8bdf1542850d66d8007d620e4050b5715dc83f4a921d36ce9ce47d0d13c5d85f2b0ff8318d2877eec2f63b931bd47417a81a538327af927da3e' | ||
Manifests-Required: | ||
- sha512 | ||
Allow-Fetch.txt: false | ||
Serialization: required | ||
Accept-Serialization: application/zip | ||
Accept-BagIt-Version: | ||
- 1.0 |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
{"type":"object","description":"Schema for tools by OCR-D MP","required":["version","git_url","tools"],"additionalProperties":false,"properties":{"version":{"description":"Version of the tool, expressed as MAJOR.MINOR.PATCH.","type":"string","pattern":"^[0-9]+\\.[0-9]+\\.[0-9]+$"},"git_url":{"description":"Github/Gitlab URL","type":"string","format":"url"},"dockerhub":{"description":"DockerHub image","type":"string"},"tools":{"type":"object","additionalProperties":false,"patternProperties":{"ocrd-.*":{"type":"object","additionalProperties":false,"required":["description","steps","executable","categories","input_file_grp","output_file_grp"],"properties":{"executable":{"description":"The name of the CLI executable in $PATH","type":"string"},"input_file_grp":{"description":"Input fileGrp@USE this tool expects by default","type":"array","items":{"type":"string","pattern":"^OCR-D-[A-Z0-9-]+$"}},"output_file_grp":{"description":"Output fileGrp@USE this tool produces by default","type":"array","items":{"type":"string","pattern":"^OCR-D-[A-Z0-9-]+$"}},"parameters":{"description":"Object describing the parameters of a tool. 
Keys are parameter names, values sub-schemas.","type":"object","patternProperties":{".*":{"type":"object","additionalProperties":false,"required":["description","type"],"properties":{"type":{"type":"string","description":"Data type of this parameter","enum":["string","number","boolean"]},"format":{"description":"Subtype, such as `float` for type `number` or `uri` for type `string`."},"description":{"description":"Concise description of syntax and semantics of this parameter"},"required":{"type":"boolean","description":"Whether this parameter is required"},"default":{"description":"Default value when not provided by the user"},"enum":{"type":"array","description":"List the allowed values if a fixed list."},"content-type":{"type":"string","description":"If parameter is reference to file: Media type of the file","pattern":"^[a-z0-9\\._-]+/[A-Za-z0-9\\._\\+-]+$"},"cacheable":{"type":"boolean","description":"If parameter is reference to file: Whether the file should be cached, e.g. because it is large and won't change.","default":false}}}}},"description":{"description":"Concise description what the tool does"},"categories":{"description":"Tools belong to this categories, representing modules within the OCR-D project structure","type":"array","items":{"type":"string","enum":["Image preprocessing","Layout analysis","Text recognition and optimization","Model training","Long-term preservation","Quality assurance"]}},"steps":{"description":"This tool can be used at these steps in the OCR-D functional 
model","type":"array","items":{"type":"string","enum":["preprocessing/characterization","preprocessing/optimization","preprocessing/optimization/cropping","preprocessing/optimization/deskewing","preprocessing/optimization/despeckling","preprocessing/optimization/dewarping","preprocessing/optimization/binarization","preprocessing/optimization/grayscale_normalization","recognition/text-recognition","recognition/font-identification","recognition/post-correction","layout/segmentation","layout/segmentation/region","layout/segmentation/line","layout/segmentation/word","layout/segmentation/classification","layout/analysis"]}}}}}}}} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,7 +1,7 @@ | ||
# OCRD-ZIP | ||
|
||
This document describes an exchange format to bundle a workspace described by a | ||
[METS file following OCR-D's conventions](mets). | ||
[METS file following OCR-D's conventions](/mets). | ||
|
||
## Rationale | ||
|
||
|
@@ -10,77 +10,179 @@ files such as images and metadata about those images such as PAGE or ALTO | |
files. METS is a textual format, not suitable for embedding arbitrary, | ||
potentially binary, data. For various use cases (such as transfer via network, | ||
long-term preservation, reproducible tests etc.) it is desirable to have a | ||
self-contained representation of a workspace. With such a representation, data | ||
producers are not forced to provide dereferencable HTTP-URL for the files they | ||
produce and data consumers are not forced to dereference all HTTP-URL. | ||
self-contained representation of a [workspace](/mets). | ||
|
||
With such a representation, data producers are not forced to provide | ||
dereferenceable HTTP URLs for the files they produce and data consumers are not | ||
forced to dereference all HTTP URLs. | ||
|
||
While METS does have mechanisms for embedding XML data and even base64-encoded | ||
binary data, the tradeoffs in file size, parsing speed and readability are too | ||
great to make this a viable solution for a mass digitization scenario. | ||
|
||
Instead, OCRD-ZIP is based on the widely used ZIP format which allows | ||
representing file hierarchies in a standardized, compressable archive format. | ||
Many formats like JAR (used in software development) and BagIt (used in | ||
long-term preservation) use the same principles: A zip file containing a | ||
manifest of contained resources and the resources themselves. For OCRD-ZIP, the | ||
METS file is the manifest. | ||
Instead, we propose an exchange format ("OCRD-ZIP") based on the BagIt spec, | ||
which is widely adopted for data ingestion in the web archiving community. | ||
|
||
## BagIt profile | ||
|
||
As a baseline, an OCRD-ZIP must adhere to [v0.97+ of the BagIt | ||
specs](https://tools.ietf.org/html/draft-kunze-bagit-16), i.e. | ||
|
||
* all files in `data/` | ||
* a file `bagit.txt` | ||
* a file `bag-info.txt` | ||
|
||
In accordance with the BagIt standard, `bagit.txt` MUST consist of exactly | ||
these two lines: | ||
|
||
``` | ||
BagIt-Version: 1.0 | ||
Tag-File-Character-Encoding: UTF-8 | ||
``` | ||
|
||
In addition, an OCRD-ZIP adheres to a [BagIt | ||
profile](https://github.com/bagit-profiles/bagit-profiles) (see [Appendix A for | ||
the full definition](#appendix-a)): | ||
|
||
* `bag-info.txt` MUST additionally contain these tags: | ||
* [`Ocrd-Identifier`](#ocrd-identifier): A globally unique identifier for this bag | ||
* [`Ocrd-Base-Version-Checksum`](#ocrd-base-version-checksum): Checksum of the version this bag is based on | ||
* `bag-info.txt` MAY additionally contain these tags: | ||
* [`Ocrd-Mets`](#ocrd-mets): Alternative path to the mets.xml file if its path IS NOT `/data/mets.xml` | ||
* [`Ocrd-Manifestation-Depth`](#ocrd-manifestation-depth): Whether all URLs are dereferenced as files or only some | ||
|
||
### `Ocrd-Mets` | ||
|
||
By default, the METS file should be at `data/mets.xml`. If this file has | ||
another name, it must be listed here and implementations MUST check for | ||
`Ocrd-Mets` before assuming `data/mets.xml`. | ||
|
||
### `Ocrd-Manifestation-Depth` | ||
|
||
Specify whether the bag contains the full manifestation of the data referenced in the METS (`full`) | ||
or only those files that were `file://` URLs before (`partial`). Default: `partial`. | ||
|
||
## Format | ||
### `Ocrd-Identifier` | ||
|
||
A globally unique identifier identifying the work/works/parts of works this | ||
bundle of files represents. | ||
|
||
This is to be used for repositories to identify new ingestions of existing works. | ||
|
||
To ensure global uniqueness, the identifier should be prefixed with an | ||
identifier of the organization, e.g. an ISIL or domain name. | ||
|
||
### `Ocrd-Base-Version-Checksum` | ||
|
||
The SHA512 checksum of the `manifest-sha512.txt` file of the version this bag | ||
was based on, if any. | ||
|
||
## Invariants | ||
|
||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why not using
As There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
If you mean the mechanism discussed in OCR-D/core#176 ( |
||
### ZIP | ||
|
||
An OCRD-ZIP MUST be a valid ZIP file. | ||
An OCRD-ZIP MUST be serialized as a ZIP file. | ||
|
||
### `mets.xml` in the root folder | ||
### `manifest-sha512.txt` | ||
|
||
The root folder of the ZIP filetree must contain a file `mets.xml`. | ||
Checksums for the files in `/data` must be calculated with the `SHA512` | ||
algorithm only and provided as `manifest-sha512.txt`. | ||
|
||
Since the checksum of this manifest file can be relevant (see | ||
[`Ocrd-Base-Version-Checksum`](#ocrd-base-version-checksum)), in addition to the requirements | ||
of the BagIt spec, the entries MUST be sorted. | ||
|
||
**NOTE:** These checksums can be generated with `find data -type f | sort -sf |xargs sha512sum > manifest-sha512.txt`. | ||
|
||
### `file://`-URLs must be relative | ||
|
||
All resources referenced in the METS with a `file://`-URL (and consequently all | ||
those referenced in other files within the workspace -- see rule "When in PAGE | ||
then in METS") must be referenced by `file://`-URL that must be relative to the | ||
root location of the workspace. | ||
then in METS") must be referenced by `file://`-URL that is absolute with root | ||
being the root location of the workspace, i.e. they MUST begin with | ||
`file:///data` | ||
|
||
Right: | ||
* `file://foo.xml` | ||
* `file://foo.tif` | ||
* `http://server/foo.tif` | ||
* `file:///data/foo.xml` | ||
* `file:///data/foo.tif` | ||
* `http://server/foo.tif` | ||
|
||
Wrong: | ||
* `file:///absolute/path/somewhere/foo.tif` | ||
|
||
### When in ZIP then in METS | ||
### When in data then in METS | ||
|
||
All files except `mets.xml` itself that are contained in the `data` directory must | ||
be referenced in a `mets:file/mets:Flocat` in the `mets.xml`. | ||
|
||
All files except `mets.xml` itself that are contained in the OCRD-ZIP must be | ||
referenced in a `file/Flocat` in the `mets.xml`. | ||
## Algorithms | ||
|
||
## Packing a workspace as OCRD-ZIP | ||
### Packing a workspace as OCRD-ZIP | ||
|
||
To pack a workspace to OCRD-ZIP: | ||
|
||
* Create a temporary folder `TMP` | ||
* Copy source METS to `TMP/mets.xml` | ||
* Foreach file `f` in `TMP/mets.xml`: | ||
* If it is not a `file://`-URL, continue | ||
* Copy the file to a location `TMP`. The structure SHOULD be `<USE>/<ID>` where | ||
* Copy mets.xml to `TMP/data/mets.xml` | ||
* Foreach `mets:file` `f` in `TMP/data/mets.xml`: | ||
* If it is not a `file://`-URL | ||
* If `Ocrd-Manifestation-Depth` is `partial` | ||
continue | ||
* Download/Copy the file to a location within `TMP/data`. The structure SHOULD be `<USE>/<ID>` where | ||
* `<USE>` is the `USE` attribute of the parent `mets:fileGrp` | ||
* `<ID>` is the `ID` attribute of the `mets:file` | ||
* Replace the URL of `f` with `file://<USE>/<ID>` in | ||
* all `mets:FLocat` of `TMP/mets.xml` | ||
* all other files in the workspace | ||
* zip the directory with the `zip` utility | ||
* Replace the URL of `f` with `file:///data/<USE>/<ID>` in | ||
* all `mets:FLocat` of `TMP/data/mets.xml` | ||
* all other files in the workspace, esp. PAGE-XML | ||
* Package `TMP` as a BagIt bag | ||
|
||
## Unpacking OCRD-ZIP to a workspace | ||
### Unpacking OCRD-ZIP to a workspace | ||
|
||
* Unzip OCRD-ZIP `z` to a folder `TMP` (e.g. `/tmp/folder-1`) | ||
* Foreach file `f` in `TMP/mets.xml`: | ||
* Unzip OCRD-ZIP `z` to a folder `TMP` | ||
* Foreach file `f` in `TMP/data/mets.xml`: | ||
* If it is not a `file://`-URL, continue | ||
* Replace the URL of `f` with `file://<ABSPATH>`, where `<ABSPATH>` is the absolute path to `f`, in | ||
* `TMP/mets.xml | ||
* all files within `TMP` | ||
|
||
## IANA considerations | ||
* `TMP/data/mets.xml` | ||
* all files within `TMP`, esp. PAGE-XML | ||
|
||
## Appendix A - BagIt profile definition | ||
|
||
<!-- BEGIN-EVAL -w '```yaml' '```' -- cat ./bagit-profile.yml --> | ||
```yaml | ||
Bagit-Profile-Info: | ||
Bagit-Profile-Identifier: https://ocr-d.github.io/bagit-profile.json | ||
Source-Organization: OCR-D | ||
External-Description: BagIt profile for OCR data | ||
Version: 0.1 | ||
Bag-Info: | ||
Bagging-Date: | ||
required: false | ||
Source-Organization: | ||
required: false | ||
Ocrd-Mets: | ||
required: false | ||
default: 'data/mets.xml' | ||
Ocrd-Manifestation-Depth: | ||
required: false | ||
default: partial | ||
values: ["partial", "full"] | ||
Ocrd-Identifier: | ||
required: true | ||
Ocrd-Checksum: | ||
required: false | ||
# echo -n | sha512sum | ||
default: 'cf83e1357eefb8bdf1542850d66d8007d620e4050b5715dc83f4a921d36ce9ce47d0d13c5d85f2b0ff8318d2877eec2f63b931bd47417a81a538327af927da3e' | ||
Manifests-Required: | ||
- sha512 | ||
Allow-Fetch.txt: false | ||
Serialization: required | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. See comments in bagit_ocrd_profile.yml. |
||
Accept-Serialization: application/zip | ||
Accept-BagIt-Version: | ||
- 1.0 | ||
``` | ||
|
||
<!-- END-EVAL --> | ||
|
||
## Appendix B - IANA considerations | ||
|
||
Proposed media type of OCRD-ZIP: `application/vnd.ocrd+zip` | ||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
X-Ocrd-Manifestation-Depth
Specify whether the bag contains the full manifestation of the data referenced in the METS (
full
) or only those files that were
file://
URLs before (partial
). In case of diff
X-Ocrd-Identifier
and X-Ocrd-Version
have to be defined as base. For safety reasons there may be also a checksum X-Ocrd-Md5
of the base file. The diff
attribute may be used to ingest new versions of an existing document into the LTA.
X-Ocrd-Identifier
A unique identifier is required for the LTA. This should be fetched from mets.xml.
X-Ocrd-Version
Positive Integer holding version number of the base. Version number will be incremented during ingest into LTA.
X-Ocrd-Md5
Checksum of file
manifest-md5.txt
found in tagmanifest-md5.txt
.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Ocrd-Checksum
Checksum of file manifest-sha512.txt