Skip to content

Commit ee177cf

Browse files
committed
Add yapremisrw, Yet Another PREMIS reader/writer
Added yapremisrw, yet another PREMIS reader/writer plugin. This was based on previous work by mcantelon. It has been rebased against current master and modified minimally to make it a dependency (plugin) injectable into `metsrw.fsentry.FSEntry`.
1 parent ea868f3 commit ee177cf

29 files changed

+9766
-15
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@ pip-delete-this-directory.txt
3232
.tox/
3333
.coverage
3434
.cache
35+
htmlcov
3536
nosetests.xml
3637
coverage.xml
3738

docs/examples.rst

Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,65 @@
1+
Example usage
2+
=============
3+
4+
Parsing METS documents
5+
----------------------
6+
7+
Example of listing the relative file paths of preservation files referenced in
8+
a METS file::
9+
10+
import metsrw
11+
12+
mets = metsrw.METSDocument.fromfile('fixtures/complete_mets_2.xml')
13+
for entry in mets.all_files():
14+
if entry.use == 'preservation':
15+
print entry.path
16+
17+
Example of retrieving a file by UUID::
18+
19+
import metsrw
20+
21+
mets = metsrw.METSDocument.fromfile('fixtures/complete_mets_2.xml')
22+
entry = mets.get_file('46b7cb96-792c-4441-a5d6-67c83313501c')
23+
print entry.path
24+
25+
Creating/modifying METS documents
26+
---------------------------------
27+
28+
Example creation of a METS document (without PREMIS or Dublin Core metadata)::
29+
30+
import metsrw
31+
import uuid
32+
33+
mw = metsrw.METSDocument()
34+
35+
# Create object entries
36+
file1 = metsrw.FSEntry('objects/cat.png', file_uuid=str(uuid.uuid4()))
37+
file2 = metsrw.FSEntry('objects/dog.jpg', file_uuid=str(uuid.uuid4()))
38+
39+
# Create preservation derivative entries
40+
file1p = metsrw.FSEntry('objects/cat-preservation.tiff', use='preservation', file_uuid=str(uuid.uuid4()), derived_from=file1)
41+
file2p = metsrw.FSEntry('objects/dog-preservation.tiff', use='preservation', file_uuid=str(uuid.uuid4()), derived_from=file2)
42+
43+
# Create object directory entry
44+
objects = metsrw.FSEntry('objects', type='Directory', children=[file1, file2, file1p, file2p])
45+
46+
# Create metadata subdirectories then metadata directory entry
47+
children = [
48+
metsrw.FSEntry('transfers', type='Directory', children=[]),
49+
metsrw.FSEntry('metadata/metadata.csv', use='metadata', file_uuid=str(uuid.uuid4())),
50+
]
51+
metadata = metsrw.FSEntry('metadata', type='Directory', children=children)
52+
53+
# Create submission METS entry and submission documentation parent directory entry
54+
children = [
55+
metsrw.FSEntry('submissionDocumentation/METS.xml', use='submissionDocumentation', file_uuid=str(uuid.uuid4())),
56+
]
57+
sub_doc = metsrw.FSEntry('submissionDocumentation', type='Directory', children=children)
58+
59+
# Create SIP entry containing objects, metadata, and submission documentation entries
60+
children = [objects, metadata, sub_doc]
61+
sip = metsrw.FSEntry('sipname-uuid', type='Directory', children=children)
62+
63+
# Add SIP entry to METS document and write to file
64+
mw.append_file(sip)
65+
mw.write('mets.xml', fully_qualified=True, pretty_print=True)

fixtures/complete_mets_2.xml

Lines changed: 7541 additions & 0 deletions
Large diffs are not rendered by default.

metsrw/__init__.py

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
AM_PNTR_SCT_PATH,
2121
get_schematron,
2222
validate,
23+
get_file_path,
2324
get_xmlschema,
2425
xsd_validate,
2526
schematron_validate,
@@ -46,8 +47,9 @@
4647
'MDRef', 'MDWrap', 'METSDocument', 'NAMESPACES', 'SCHEMA_LOCATIONS',
4748
'lxmlns', 'FILE_ID_PREFIX', 'GROUP_ID_PREFIX', 'METS_XSD_PATH',
4849
'AM_SCT_PATH', 'AM_PNTR_SCT_PATH', 'get_schematron', 'validate',
49-
'get_xmlschema', 'xsd_validate', 'schematron_validate',
50-
'sct_report_string', 'xsd_error_log_string', 'report_string',
51-
'FeatureBroker', 'set_feature_broker_to_default_state',
52-
'feature_broker', 'Dependency', 'has_class_methods', 'has_methods',
53-
'is_class', 'plugins', '__version__']
50+
'get_file_path', 'get_xmlschema', 'xsd_validate',
51+
'schematron_validate', 'sct_report_string', 'xsd_error_log_string',
52+
'report_string', 'FeatureBroker',
53+
'set_feature_broker_to_default_state', 'feature_broker',
54+
'Dependency', 'has_class_methods', 'has_methods', 'is_class',
55+
'plugins', '__version__']

metsrw/di.py

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,10 @@
1919
See http://code.activestate.com/recipes/413268/
2020
"""
2121

22-
from .plugins import premisrw
22+
from .plugins import (
23+
premisrw,
24+
dcrw
25+
)
2326

2427

2528
class FeatureBroker(object):
@@ -62,10 +65,21 @@ def __getitem__(self, feature_name):
6265

6366

6467
def set_feature_broker_to_default_state(fb):
68+
"""Provide dependencies via the global singleton feature broker.
69+
70+
To use yapremisrw, provide different class(es) from that plugin, e.g., to
71+
use ``yapremisrw.Event``::
72+
73+
>>> from metsrw.plugins import yapremisrw
74+
>>> from metsrw import feature_broker as fb
75+
>>> fb.provide('premis_event_class', yapremisrw.Event)
76+
77+
"""
6578
fb.clear()
6679
fb.provide('premis_object_class', premisrw.PREMISObject)
6780
fb.provide('premis_event_class', premisrw.PREMISEvent)
6881
fb.provide('premis_agent_class', premisrw.PREMISAgent)
82+
fb.provide('dublin_core_class', dcrw.DublinCoreXmlData)
6983

7084

7185
feature_broker = FeatureBroker() # global singleton feature broker

metsrw/fsentry.py

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -89,10 +89,16 @@ class FSEntry(object):
8989
has_methods('serialize'),
9090
has_class_methods('fromtree'),
9191
is_class)
92+
dublin_core_class = Dependency(
93+
'dublin_core_class',
94+
has_methods('serialize'),
95+
has_class_methods('fromtree'),
96+
is_class)
9297

9398
PREMIS_OBJECT = 'PREMIS:OBJECT'
9499
PREMIS_EVENT = 'PREMIS:EVENT'
95100
PREMIS_AGENT = 'PREMIS:AGENT'
101+
DublinCore = 'DC'
96102

97103
def __init__(self, path=None, label=None, use='original', type=u'Item',
98104
children=None, file_uuid=None, derived_from=None,
@@ -131,6 +137,11 @@ def __init__(self, path=None, label=None, use='original', type=u'Item',
131137
self.amdsecs = []
132138
self.dmdsecs = []
133139

140+
# Convenient access to metadata (without cycling through amdsecs)
141+
self.techmds = []
142+
self.digiprovmds = []
143+
self.rightsmds = []
144+
134145
def __str__(self):
135146
return '{s.type}: {s.path}'.format(s=self)
136147

@@ -271,7 +282,9 @@ def add_premis_rights(self, md, mode='mdwrap'):
271282

272283
def add_dublin_core(self, md, mode='mdwrap'):
273284
# TODO add extra args and create DC object here
274-
return self.add_dmdsec(md, 'DC', mode)
285+
return self.add_dmdsec(
286+
self.serialize_md_inst(md, self.dublin_core_class),
287+
self.DublinCore, mode)
275288

276289
def add_child(self, child):
277290
"""Add a child FSEntry to this FSEntry.
@@ -414,3 +427,7 @@ def get_premis_events(self):
414427
def get_premis_agents(self):
415428
return self.get_subsections_of_type(
416429
self.PREMIS_AGENT, self.premis_agent_class)
430+
431+
def get_dublin_core(self):
432+
return self.get_subsections_of_type(
433+
self.DublinCore, self.dublin_core_class)

metsrw/metadata.py

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111

1212
from . import exceptions
1313
from . import utils
14+
from .plugins.dcrw import DublinCoreXmlData
1415

1516
LOGGER = logging.getLogger(__name__)
1617

@@ -298,14 +299,18 @@ class MDWrap(object):
298299
include "PREMIS:OBJECT", "PREMIS:EVENT", "DC" and "OTHER".
299300
:param str othermdtype: The OTHERMDTYPE of the XML document. Should be set if mdtype is "OTHER".
300301
"""
301-
def __init__(self, document, mdtype, othermdtype=None):
302+
303+
MDTYPE_CLASSES = {'DC': DublinCoreXmlData}
304+
305+
def __init__(self, document, mdtype, othermdtype=None, data=None):
302306
parser = etree.XMLParser(remove_blank_text=True)
303307
if isinstance(document, six.string_types):
304308
self.document = etree.fromstring(document, parser=parser)
305309
elif isinstance(document, (etree._Element, list)):
306310
self.document = document
307311
self.mdtype = mdtype
308312
self.othermdtype = othermdtype
313+
self.data = data
309314

310315
@classmethod
311316
def parse(cls, root):
@@ -321,6 +326,12 @@ def parse(cls, root):
321326
mdtype = root.get('MDTYPE')
322327
if not mdtype:
323328
raise exceptions.ParseError('mdWrap must have a MDTYPE')
329+
if mdtype in MDWrap.MDTYPE_CLASSES.keys():
330+
mdtype_class = MDWrap.MDTYPE_CLASSES[mdtype]()
331+
data = mdtype_class.parse(root.find('mets:xmlData', namespaces=utils.NAMESPACES)).__dict__
332+
else:
333+
data = None
334+
324335
othermdtype = root.get('OTHERMDTYPE')
325336
document = root.xpath('mets:xmlData/*', namespaces=utils.NAMESPACES)
326337
if len(document) == 0:
@@ -329,7 +340,7 @@ def parse(cls, root):
329340
' one has none')
330341
elif len(document) == 1:
331342
document = document[0]
332-
return cls(document, mdtype, othermdtype)
343+
return cls(document, mdtype, othermdtype=othermdtype, data=data)
333344

334345
def serialize(self):
335346
el = etree.Element(utils.lxmlns('mets') + 'mdWrap', MDTYPE=self.mdtype)

metsrw/mets.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -389,6 +389,11 @@ def _add_amdsecs_to_fs_entry(amdids, fs_entry, tree):
389389
namespaces=utils.NAMESPACES)
390390
amdsec = metadata.AMDSec.parse(amdsec_elem)
391391
fs_entry.amdsecs.append(amdsec)
392+
# Add subsections to convenience properties
393+
for subsection in amdsec.subsections:
394+
getattr(
395+
fs_entry, subsection.subsection.lower() + 's').append(
396+
subsection)
392397

393398
def _parse_tree(self, tree=None):
394399
if tree is None:

metsrw/plugins/dcrw/__init__.py

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
from __future__ import absolute_import
2+
3+
import logging
4+
5+
from .dc import DublinCoreXmlData
6+
from .utils import (
7+
NAMESPACES,
8+
DUBLINCORE_SCHEMA_LOCATIONS,
9+
lxmlns,
10+
)
11+
from .exceptions import (
12+
DcError,
13+
ConstructError,
14+
ParseError
15+
)
16+
17+
18+
LOGGER = logging.getLogger(__name__)
19+
LOGGER.addHandler(logging.NullHandler())
20+
21+
22+
__all__ = [
23+
'DublinCoreXmlData',
24+
'NAMESPACES',
25+
'DUBLINCORE_SCHEMA_LOCATIONS',
26+
'lxmlns',
27+
'DcError',
28+
'ConstructError',
29+
'ParseError',
30+
]

metsrw/plugins/dcrw/dc.py

Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,73 @@
1+
from __future__ import absolute_import
2+
3+
from collections import OrderedDict
4+
import logging
5+
from lxml import etree
6+
7+
from .exceptions import ParseError
8+
from .utils import lxmlns, NAMESPACES, DUBLINCORE_SCHEMA_LOCATIONS
9+
10+
LOGGER = logging.getLogger(__name__)
11+
12+
13+
class DublinCoreXmlData(object):
    """A METS ``xmlData`` element wrapping a Dublin Core ``dublincore`` element.

    Every name listed in ``DC_ELEMENTS`` is available as an instance
    attribute. ``parse`` fills each attribute from the text of the matching
    ``dc:`` child element; ``serialize`` writes one ``dc:`` child per name.
    """

    # Order matters: ``_serialize_dublincore`` emits the ``dc:`` children in
    # exactly this order.
    DC_ELEMENTS = [
        'title', 'creator', 'subject', 'description', 'publisher',
        'contributor', 'date', 'format', 'identifier', 'source', 'relation',
        'language', 'coverage', 'rights',
    ]

    def __init__(self, title=None, creator=None, subject=None,
                 description=None, publisher=None, contributor=None,
                 date=None, format=None, identifier=None, source=None,
                 relation=None, language=None, coverage=None, rights=None):
        """Store each Dublin Core value as an attribute of the same name.

        All parameters are optional and default to ``None``.
        """
        # Explicit name -> value mapping instead of peeking into ``locals()``,
        # so the assignment does not depend on what else is in local scope.
        supplied = {
            'title': title,
            'creator': creator,
            'subject': subject,
            'description': description,
            'publisher': publisher,
            'contributor': contributor,
            'date': date,
            'format': format,
            'identifier': identifier,
            'source': source,
            'relation': relation,
            'language': language,
            'coverage': coverage,
            'rights': rights,
        }
        for name in self.DC_ELEMENTS:
            setattr(self, name, supplied[name])

    @classmethod
    def parse(cls, root):
        """Parse an xmlData element containing a Dublin Core dublincore element.

        :param root: Element or ElementTree to be parsed into an object.
        :raises ParseError: If the root is missing, is not xmlData, or doesn't
            contain a dublincore element.
        """
        # An ElementTree has no ``tag``; unwrap it to its root element so the
        # documented "Element or ElementTree" input really works.
        if root is not None and hasattr(root, 'getroot'):
            root = root.getroot()

        # ``root`` is None when the caller's ``find()`` for xmlData matched
        # nothing; report that as a ParseError rather than an AttributeError.
        if root is None or root.tag != lxmlns('mets') + 'xmlData':
            raise ParseError(
                'DublinCoreXmlData can only parse xmlData elements with mets'
                ' namespace.')

        dc_el = root.find('dcterms:dublincore', namespaces=NAMESPACES)
        if dc_el is None or dc_el.tag != lxmlns('dcterms') + 'dublincore':
            raise ParseError(
                'xmlData can only contain a dublincore element with the'
                ' dcterms namespace.')

        # Pass values by keyword so the result does not rely on DC_ELEMENTS
        # being in the same order as the constructor's parameters.
        values = dict(
            (name, dc_el.findtext('dc:' + name, namespaces=NAMESPACES))
            for name in DublinCoreXmlData.DC_ELEMENTS)
        return cls(**values)

    # Alias of ``parse`` under the ``fromtree`` name.
    fromtree = parse

    def serialize(self):
        """Return a ``mets:xmlData`` element containing the dublincore element.

        :return: The newly created xmlData element.
        """
        nsmap = OrderedDict([
            ('mets', NAMESPACES['mets']),
            ('xsi', NAMESPACES['xsi']),
            ('xlink', NAMESPACES['xlink'])
        ])
        root = etree.Element(lxmlns('mets') + 'xmlData', nsmap=nsmap)
        root.append(self._serialize_dublincore())
        return root

    def _serialize_dublincore(self):
        """Return a ``dcterms:dublincore`` element with one ``dc:`` child per
        name in ``DC_ELEMENTS``.

        The element carries an ``xsi:schemaLocation`` attribute set to
        ``DUBLINCORE_SCHEMA_LOCATIONS``.
        """
        nsmap = OrderedDict([
            ('dcterms', NAMESPACES['dcterms']),
            ('dc', NAMESPACES['dc'])
        ])
        attrib = {
            '{}schemaLocation'.format(lxmlns('xsi')):
                DUBLINCORE_SCHEMA_LOCATIONS,
        }
        dc_root = etree.Element(
            lxmlns('dcterms') + 'dublincore', nsmap=nsmap, attrib=attrib)

        for name in DublinCoreXmlData.DC_ELEMENTS:
            # A child is written for every name; an attribute that is None
            # leaves that child without text.
            child = etree.Element(lxmlns('dc') + name)
            child.text = getattr(self, name)
            dc_root.append(child)

        return dc_root

0 commit comments

Comments
 (0)