Skip to content

Commit

Permalink
Merge pull request #197 from openpreserve/fix/prep-niggles
Browse files Browse the repository at this point in the history
FIX: Signature preparation niggles and bugs
  • Loading branch information
carlwilson authored Mar 29, 2022
2 parents 0e1ede0 + 3f03a2d commit f5941d7
Show file tree
Hide file tree
Showing 7 changed files with 159 additions and 97 deletions.
6 changes: 3 additions & 3 deletions fido/conf/fido-formats.xsd
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@ Usage of DC has been based on these references:
http://dublincore.org/documents/usageguide/qualifiers.shtml
http://www.dublincore.org/documents/dc-xml-guidelines/
-->
<xs:schema elementFormDefault="qualified"
xmlns:xs="http://www.w3.org/2001/XMLSchema"
<xs:schema elementFormDefault="qualified"
xmlns:xs="http://www.w3.org/2001/XMLSchema"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xmlns:dc="http://purl.org/dc/elements/1.1/"
xmlns:dcterms="http://purl.org/dc/terms/">
Expand Down Expand Up @@ -34,7 +34,7 @@ Usage of DC has been based on these references:
<xs:element maxOccurs="unbounded" minOccurs="0" ref="extension"/>
<xs:element maxOccurs="1" minOccurs="0" name="apple_uti" type="xs:string"/>
<xs:element maxOccurs="unbounded" minOccurs="0" ref="has_priority_over"/>
<xs:element maxOccurs="unbounded" ref="signature"/>
<xs:element maxOccurs="unbounded" minOccurs="0" ref="signature"/>
<xs:element minOccurs="0" ref="note"/>
<xs:element maxOccurs="1" ref="details"/>
</xs:sequence>
Expand Down
4 changes: 2 additions & 2 deletions fido/fido.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@

from fido import __version__, CONFIG_DIR
from fido.package import OlePackage, ZipPackage
from fido.pronomutils import get_local_pronom_versions
from fido.versions import get_local_versions
from fido.char_handler import escape


Expand Down Expand Up @@ -796,7 +796,7 @@ def main(args=None):

timer = PerfTimer()

versions = get_local_pronom_versions(args.confdir)
versions = get_local_versions(args.confdir)

defaults['xml_pronomSignature'] = versions.pronom_signature
defaults['containersignature_file'] = versions.pronom_container_signature
Expand Down
22 changes: 15 additions & 7 deletions fido/prepare.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,9 @@
from six.moves import cStringIO
from six.moves.urllib.request import urlopen
from six.moves.urllib.parse import urlparse
from six.moves.urllib.error import HTTPError

from .pronomutils import get_local_pronom_versions
from .versions import get_local_versions
from .char_handler import escape


Expand Down Expand Up @@ -62,8 +63,9 @@ def prettify(elem):
class FormatInfo:
"""Convert PRONOM formats into FIDO signatures."""

def __init__(self, pronom_files, format_list=[]):
def __init__(self, pronom_files, format_list=None):
"""Instantiate class, take a list of PRONOM files and an optional list of formats."""
format_list = format_list if format_list else []
self.info = {}
self.formats = []
self.pronom_files = pronom_files
Expand Down Expand Up @@ -192,7 +194,7 @@ def parse_pronom_xml(self, source, puid_filter=None):
for id in pronom_format.findall(TNA('FileFormatIdentifier')):
type = get_text_tna(id, 'IdentifierType')
if type == 'Apple Uniform Type Identifier':
ET.SubElement(fido_format, 'apple_uid').text = get_text_tna(id, 'Identifier')
ET.SubElement(fido_format, 'apple_uti').text = get_text_tna(id, 'Identifier')
# Handle the relationships
for x in pronom_format.findall(TNA('RelatedFormat')):
rel = get_text_tna(x, 'RelationshipType')
Expand Down Expand Up @@ -275,9 +277,15 @@ def parse_pronom_xml(self, source, puid_filter=None):
ET.SubElement(rf, 'dc:identifier').text = url
# And calculate the checksum of this resource:
m = hashlib.md5()
sock = urlopen(url)
m.update(sock.read())
sock.close()
try:
sock = urlopen(url)
m.update(sock.read())
sock.close()
except HTTPError as http_excep:
sys.stderr.write('HTTP {} error loading resource {}\n'.format(http_excep.code, url))
if http_excep.code == 404:
continue

checksum = m.hexdigest()
else:
ET.SubElement(rf, 'dc:identifier').text = get_text_tna(id, 'IdentifierType') + ":" + get_text_tna(id, 'Identifier')
Expand Down Expand Up @@ -686,7 +694,7 @@ def convert_to_regex(chars, endianness='', pos='BOF', offset='0', maxoffset=''):

def run(input=None, output=None, puid=None):
"""Convert PRONOM formats into FIDO signatures."""
versions = get_local_pronom_versions()
versions = get_local_versions()

if input is None:
input = versions.get_zip_file()
Expand Down
30 changes: 30 additions & 0 deletions fido/pronom/http.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
FIDO: Format Identifier for Digital Objects.
Copyright 2010 The Open Preservation Foundation
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
PRONOM format signatures HTTP calls.
"""
from six.moves import urllib


def get_sig_xml_for_puid(puid):
    """
    Return the full PRONOM signature XML for the passed PUID.

    The PUID (e.g. ``fmt/1``) is substituted into the PRONOM URL template
    and the raw response body, exactly as returned by ``response.read()``,
    is handed back to the caller. Any error raised while opening or reading
    the URL is propagated unchanged.
    """
    req = urllib.request.Request("http://www.nationalarchives.gov.uk/pronom/{}.xml".format(puid))
    response = urllib.request.urlopen(req)
    try:
        return response.read()
    finally:
        # Always release the connection, even if the read fails part way.
        response.close()
4 changes: 2 additions & 2 deletions fido/toxml.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
import sys

from . import __version__
from .pronomutils import get_local_pronom_versions
from .versions import get_local_versions


def main():
Expand All @@ -35,7 +35,7 @@ def main():
<versions>
<fido_version>{0}</fido_version>
<signature_version>{1}</signature_version>
</versions>""".format(__version__, get_local_pronom_versions().pronom_version))
</versions>""".format(__version__, get_local_versions().pronom_version))

reader = csv.reader(sys.stdin)

Expand Down
178 changes: 101 additions & 77 deletions fido/update_signatures.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,13 +23,11 @@
from xml.etree import ElementTree as CET
import zipfile

from six.moves.urllib.request import urlopen
from six.moves.urllib.error import URLError

from . import __version__, CONFIG_DIR, query_yes_no
from .prepare import run as prepare_pronom_to_fido
from .pronomutils import get_local_pronom_versions
from .pronom.soap import get_pronom_sig_version, get_pronom_signature
from .versions import get_local_versions
from .pronom.soap import get_pronom_sig_version, get_pronom_signature, NS
from .pronom.http import get_sig_xml_for_puid


DEFAULTS = {
Expand All @@ -56,57 +54,18 @@ def run(defaults=None):
defaults = defaults or DEFAULTS
try:
print("Contacting PRONOM...")
currentVersion = get_pronom_sig_version()
if not currentVersion:
sys.exit('Failed to obtain PRONOM signature file version number, please try again.')

print("Querying latest signaturefile version...")
signatureFile = os.path.join(CONFIG_DIR, defaults['signatureFileName'].format(currentVersion))
if os.path.isfile(signatureFile):
print("You already have the latest PRONOM signature file, version", currentVersion)
if not query_yes_no("Update anyway?"):
sys.exit('Aborting update...')

print("Downloading signature file version {}...".format(currentVersion))
currentFile, _ = get_pronom_signature()
if not currentFile:
sys.exit('Failed to obtain PRONOM signature file, please try again.')
print("Writing {0}...".format(defaults['signatureFileName'].format(currentVersion)))
with open(signatureFile, 'w') as file_:
file_.write(currentFile)

currentVersion, signatureFile = sig_version_check(defaults)
download_sig_file(defaults, currentVersion, signatureFile)
print("Extracting PRONOM PUID's from signature file...")
tree = CET.parse(signatureFile)
puids = []
for node in tree.iter("{http://www.nationalarchives.gov.uk/pronom/SignatureFile}FileFormat"):
puids.append(node.get("PUID"))
print("Found {} PRONOM PUID's".format(len(puids)))

print("Downloading signatures can take a while")
if not query_yes_no("Continue and download signatures?"):
sys.exit('Aborting update...')
tmpdir = defaults['tmp_dir']
resume_download = False
if os.path.isdir(tmpdir):
print("Found previously created temporary folder for download:", tmpdir)
resume_download = query_yes_no('Do you want to resume download (yes) or start over (no)?')
if resume_download:
print("Resuming download...")
else:
print("Creating temporary folder for download:", tmpdir)
try:
os.mkdir(tmpdir)
except OSError:
pass
if not os.path.isdir(tmpdir):
sys.stderr.write("Failed to create temporary folder for PUID's, using: " + tmpdir)

download_signatures(defaults, puids, resume_download, tmpdir)
create_zip_file(defaults, puids, currentVersion, tmpdir)
format_eles = tree.findall('.//sig:FileFormat', NS)
print("Found {} PRONOM FileFormat elements".format(len(format_eles)))
tmpdir, resume = init_sig_download(defaults)
download_signatures(defaults, format_eles, resume, tmpdir)
create_zip_file(defaults, format_eles, currentVersion, tmpdir)
if defaults['deleteTempDirectory']:
print("Deleting temporary folder and files...")
rmtree(tmpdir, ignore_errors=True)

update_versions_xml(defaults, currentVersion)

# TODO: there should be a check here to handle prepare.main exit() signal (-1/0/1/...)
Expand All @@ -118,47 +77,106 @@ def run(defaults=None):
sys.exit('Aborting update...')


def download_signatures(defaults, puids, resume_download, tmpdir):
def sig_version_check(defaults):
    """
    Return a tuple consisting of current sig file version and the derived file name.

    The version is obtained via get_pronom_sig_version(); the file name is
    built from the 'signatureFileName' template in defaults and placed under
    CONFIG_DIR. Exits the process if no version could be obtained, or if the
    file already exists and the user declines to update anyway.
    """
    print("Contacting PRONOM...")
    latest_version = get_pronom_sig_version()
    if not latest_version:
        sys.exit('Failed to obtain PRONOM signature file version number, please try again.')

    print("Querying latest signaturefile version...")
    sig_name = defaults['signatureFileName'].format(latest_version)
    sig_path = os.path.join(CONFIG_DIR, sig_name)
    already_present = os.path.isfile(sig_path)
    if already_present:
        print("You already have the latest PRONOM signature file, version", latest_version)
        # Only ask the user when there is an existing file to overwrite.
        if not query_yes_no("Update anyway?"):
            sys.exit('Aborting update...')
    return latest_version, sig_path


def download_sig_file(defaults, version, signatureFile):
    """
    Download the latest version of the PRONOM sigs to signatureFile.

    The content comes from get_pronom_signature(); the process exits if
    nothing is returned. The 'signatureFileName' template in defaults is
    only used to build the progress message.
    """
    print("Downloading signature file version {}...".format(version))
    sig_content, _ignored = get_pronom_signature()
    if not sig_content:
        sys.exit('Failed to obtain PRONOM signature file, please try again.')
    target_name = defaults['signatureFileName'].format(version)
    print("Writing {0}...".format(target_name))
    with open(signatureFile, 'w') as sig_handle:
        sig_handle.write(sig_content)


def init_sig_download(defaults):
    """
    Initialise the download of individual PRONOM signatures.

    Handles user input and resumption of interrupted downloads.
    Return a tuple of the temp directory for writing and a boolean resume flag.

    The temp directory is taken from defaults['tmp_dir']. If it already
    exists the user is asked whether to resume; otherwise it is created.
    Exits the process if the user declines to download at all.
    """
    print("Downloading signatures can take a while")
    if not query_yes_no("Continue and download signatures?"):
        sys.exit('Aborting update...')
    tmpdir = defaults['tmp_dir']
    resume = False
    if os.path.isdir(tmpdir):
        print("Found previously created temporary folder for download:", tmpdir)
        resume = query_yes_no('Do you want to resume download (yes) or start over (no)?')
        if resume:
            print("Resuming download...")
    else:
        print("Creating temporary folder for download:", tmpdir)
        try:
            os.mkdir(tmpdir)
        except OSError:
            # Deliberately best-effort: the failure is reported by the
            # directory check immediately below.
            pass
    if not os.path.isdir(tmpdir):
        # Terminate the line so later stderr/stdout output does not run on.
        sys.stderr.write("Failed to create temporary folder for PUID's, using: " + tmpdir + "\n")
    return tmpdir, resume


def download_signatures(defaults, format_eles, resume, tmpdir):
"""Download PRONOM signatures and write to individual files."""
print("Downloading signatures, one moment please...")
numberPuids = len(puids)
numberPuids = len(format_eles)
one_percent = (float(numberPuids) / 100)
numfiles = 0
for puid in puids:
puidType, puidNum = puid.split("/")
puidFileName = "puid." + puidType + "." + puidNum + ".xml"
filename = os.path.join(tmpdir, puidFileName)
if os.path.isfile(filename) and resume_download:
numfiles += 1
continue
puid_url = "http://www.nationalarchives.gov.uk/pronom/{}.xml".format(puid)
try:
filehandle = urlopen(puid_url)
except URLError as e:
sys.stderr.write("Failed to download signature file:" + puid_url)
sys.stderr.write("Error:" + str(e))
sys.exit('Please restart and resume download.')
with open(filename, 'wb') as file_:
for lines in filehandle.readlines():
file_.write(lines)
filehandle.close()
for format_ele in format_eles:
download_sig(format_ele, tmpdir, resume)
numfiles += 1
percent = int(float(numfiles) / one_percent)
print(r"{}/{} files [{}%]".format(numfiles, numberPuids, percent))
time.sleep(defaults['http_throttle'])
print("100%")


def create_zip_file(defaults, puids, currentVersion, tmpdir):
def download_sig(format_ele, tmpdir, resume):
    """
    Download an individual PRONOM signature.

    The signature to be downloaded is identified by the FileFormat element
    parameter format_ele. The downloaded signature is written to tmpdir.
    When resume is true and the target file already exists the download is
    skipped. Exits the process if the signature cannot be retrieved.
    """
    puid, puidFileName = get_puid_file_name(format_ele)
    filename = os.path.join(tmpdir, puidFileName)
    if resume and os.path.isfile(filename):
        return
    try:
        xml = get_sig_xml_for_puid(puid)
    except Exception as e:
        # Each message gets its own line so the PUID, the error text and the
        # exit message are not run together on stderr.
        sys.stderr.write("Failed to download signature file: " + puid + "\n")
        sys.stderr.write("Error: " + str(e) + "\n")
        sys.exit('Please restart and resume download.')
    with open(filename, 'wb') as file_:
        file_.write(xml)


def create_zip_file(defaults, format_eles, currentVersion, tmpdir):
"""Create zip file of signatures."""
print("Creating PRONOM zip...")
compression = zipfile.ZIP_DEFLATED if 'zlib' in sys.modules else zipfile.ZIP_STORED
modes = {zipfile.ZIP_DEFLATED: 'deflated', zipfile.ZIP_STORED: 'stored'}
zf = zipfile.ZipFile(os.path.join(CONFIG_DIR, defaults['pronomZipFileName'].format(currentVersion)), mode='w')
print("Adding files with compression mode", modes[compression])
for puid in puids:
puidType, puidNum = puid.split("/")
puidFileName = "puid.{}.{}.xml".format(puidType, puidNum)
for format_ele in format_eles:
_, puidFileName = get_puid_file_name(format_ele)
filename = os.path.join(tmpdir, puidFileName)
if os.path.isfile(filename):
zf.write(filename, arcname=puidFileName, compress_type=compression)
Expand All @@ -167,10 +185,17 @@ def create_zip_file(defaults, puids, currentVersion, tmpdir):
zf.close()


def get_puid_file_name(format_ele):
    """
    Return a tuple of PUID and PUID file name derived from format_ele.

    format_ele must offer a ``get`` method returning its 'PUID' attribute,
    a string of the form ``<type>/<number>`` (e.g. ``fmt/1``). The file
    name returned is ``puid.<type>.<number>.xml``.

    Raises ValueError if the PUID attribute is missing or does not contain
    exactly one '/' separator.
    """
    puid = format_ele.get('PUID')
    if not puid:
        # Without this guard a missing attribute fails on None.split().
        raise ValueError("FileFormat element has no PUID attribute")
    puid_type, separator, puid_num = puid.partition("/")
    if not separator or "/" in puid_num:
        raise ValueError("Malformed PUID (expected '<type>/<number>'): {}".format(puid))
    return puid, 'puid.{}.{}.xml'.format(puid_type, puid_num)


def update_versions_xml(defaults, currentVersion):
"""Create new versions identified sig XML file."""
print('Updating versions.xml...')
versions = get_local_pronom_versions()
versions = get_local_versions()
versions.pronom_version = str(currentVersion)
versions.pronom_signature = "formats-v" + str(currentVersion) + ".xml"
versions.pronom_container_signature = defaults['containerVersion']
Expand All @@ -188,7 +213,6 @@ def main():
args = parser.parse_args()
opts = DEFAULTS.copy()
opts.update(vars(args))

run(opts)


Expand Down
Loading

0 comments on commit f5941d7

Please sign in to comment.