22
22
import org .culturegraph .mf .framework .annotations .Out ;
23
23
import org .culturegraph .mf .util .StringUtil ;
24
24
25
-
26
25
/**
27
- * <p>Parses pica+ records. The parser only parses single records.
28
- * A string containing multiple records must be split into
29
- * individual records before passing it to {@code PicaDecoder}.</p>
26
+ * Parses pica+ records. The parser only parses single records. A string
27
+ * containing multiple records must be split into individual records before
28
+ * passing it to {@code PicaDecoder}.
29
+ * <p>
30
+ * The parser is designed to accept any string as valid input and to parse pica
31
+ * plain format as well as normalised pica. To achieve this, the parser behaves
32
+ * as follows:
33
+ * <ul>
34
+ * <li>The parser assumes that the input starts with a field name.
30
35
*
31
- * <p>The parser is designed to accept any string as valid input and
32
- * to parse pica plain format as well as normalised pica. To
33
- * achieve this, the parser behaves as following:</p>
36
+ * <li>The field name and the first subfield are separated by a subfield
37
+ * marker (\u001f).
34
38
*
35
- * <ul>
36
- * <li>Fields are separated by record markers (0x1d), field
37
- * markers (0x1e) or field end markers (0x0a).</li>
38
- * <li>The field name and the first subfield are separated by
39
- * a subfield marker (0x01f).</li>
40
- * <li>The parser assumes that the input starts with a field
41
- * name.</li>
42
- * <li>The parser assumes that the end of the input marks
43
- * the end of the current field and the end of the record.
44
- * </li>
45
- * <li>Subfields are separated by subfield markers (0x1f).</li>
46
- * <li>The first character of a subfield is the name of the
47
- * subfield</li>
48
- * <li>To handle input with multiple field and subfield separators
49
- * following each other directly (for instance 0x0a and 0x1e), it
50
- * is assumed that field names, subfields, subfield names or
51
- * subfield values can be empty.</li>
52
- * </ul>
39
+ * <li>Fields are separated by record markers (\u001d), field
40
+ * markers (\u001e) or field end markers (\u000a).
53
41
*
54
- * <p>Please note that the record markers is treated as a field
55
- * delimiter and not as a record delimiter. Records need to be
56
- * separated prior to parsing them.</p>
42
+ * <li>Subfields are separated by subfield markers (\u001f).
57
43
*
58
- * <p>As the behaviour of the parser may result in unnamed fields or
59
- * subfields or fields with no subfields the {@code PicaDecoder}
60
- * automatically filters empty fields and subfields:</p>
44
+ * <li>The first character of a subfield is the name of the subfield.
61
45
*
62
- * <ul>
63
- * <li>Subfields without a name are ignored (such fields cannot
64
- * have any value because then the first character of the value
65
- * would be the field name).</li>
66
- * <li>Subfields which only have a name but no value are always
67
- * parsed.</li>
68
- * <li>Unnamed Fields are only parsed if the contain not-ignored
69
- * subfields.</li>
70
- * <li>Named fields containing none or only ignored subfields are
71
- * only parsed if {@code skipEmptyFields} is set to {@code false}
72
- * otherwise they are ignored.</li>
73
- * <li>Input containing only whitespace (spaces and tabs) is
74
- * completely ignored</li>
46
+ * <li>The parser assumes that the end of the input marks the end of the
47
+ * current field and the end of the record.
48
+ *
49
+ * <li>To handle input with multiple field and subfield separators following
50
+ * each other directly (for instance \u000a and \u001e), it is assumed
51
+ * that field names, subfields, subfield names or subfield values can be
52
+ * empty.
75
53
* </ul>
54
+ * Please note that the record marker is treated as a field delimiter and not
55
+ * as a record delimiter. Records need to be separated prior to parsing them.
56
+ * <p>
57
+ * As the behaviour of the parser may result in unnamed fields or subfields or
58
+ * fields with no subfields, the {@code PicaDecoder} automatically filters empty
59
+ * fields and subfields:
60
+ * <ul>
61
+ * <li>Subfields without a name are ignored (such subfields cannot have any
62
+ * value because then the first character of the value would be the name of
63
+ * the subfield).
76
64
*
77
- * <p>The {@code PicaDecoder} calls {@code receiver.startEntity} and
78
- * {@code receiver.endEntity} for each parsed field and
79
- * {@code receiver.literal} for each parsed subfield. Spaces in the
80
- * field name are not included in the entity name. The input
81
- * "028A \x1faAndy\x1fdWarhol\x1e" would produce the following
82
- * sequence of calls:</p>
65
+ * <li>Subfields which only have a name but no value are always parsed.
83
66
*
84
- * <ol>
85
- * <li>receiver.startEntity("028A")</li>
86
- * <li>receiver.literal("a", "Andy")</li>
87
- * <li>receiver.literal("d", "Warhol")</li>
88
- * <li>receiver.endEntity()</li>
89
- * </ol>
67
+ * <li>Unnamed fields are only parsed if they contain non-ignored subfields.
90
68
*
91
- * <p>The content of subfield 003@$0 is used for the record id. If
92
- * {@code ignoreMissingIdn } is false and field 003@$0 is not found
93
- * in the record a {@link MissingIdException} is thrown.</p>
69
+ * <li>Named fields containing none or only ignored subfields are only parsed
70
+ * if {@link #setSkipEmptyFields(boolean)} is set to false; otherwise they are
71
+ * ignored.
94
72
*
95
- * <p>The parser assumes that the input is utf-8 encoded. The parser
96
- * does not support other pica encodings.</p>
73
+ * <li>Input containing only whitespace (spaces and tabs) is completely
74
+ * ignored.
75
+ * </ul>
76
+ * The {@code PicaDecoder} emits <i>start-entity</i> and <i>end-entity</i>
77
+ * events for each parsed field and <i>literal</i> events for each parsed
78
+ * subfield. Field names are trimmed by default (leading and trailing whitespace
79
+ * is removed). This can be changed by setting
80
+ * {@link #setTrimFieldNames(boolean)} to false.
81
+ * <p>
82
+ * The content of subfield <i>003@ $0</i> is used as record id. If
83
+ * {@link #setIgnoreMissingIdn(boolean)} is false and field
84
+ * <i>003@ $0</i> is not found in the record a
85
+ * {@link MissingIdException} is thrown; otherwise the record identifier is an
86
+ * empty string.
87
+ * <p>
88
+ * For example, when run on the input
89
+ * <pre>
90
+ * 003@ \u001f01234\u001e
91
+ * 028A \u001faAndy\u001fdWarhol\u001e
92
+ * </pre>
93
+ *
94
+ * the {@code PicaDecoder} will produce the following sequence of events:
95
+ * <pre>{@literal
96
+ * start-record "1234"
97
+ * start-entity "003@"
98
+ * literal "0": 1234
99
+ * end-entity
100
+ * start-entity "028A"
101
+ * literal "a": Andy
102
+ * literal "d": Warhol
103
+ * end-entity
104
+ * end-record
105
+ * }</pre>
106
+ *
107
+ * The parser assumes that the input is UTF-8 encoded. The parser does not
108
+ * support other pica encodings.
97
109
*
98
110
* @author Christoph Böhme
99
111
*
@@ -118,6 +130,21 @@ public final class PicaDecoder
118
130
119
131
private boolean ignoreMissingIdn ;
120
132
133
+ /**
134
+ * Controls whether records having no pica subfield <i>003@ $0</i>
135
+ * (which contains the record identifier <i>IDN</i>) are reported as faulty.
136
+ * By default such records are reported by the {@code PicaDecoder} by throwing
137
+ * a {@link MissingIdException}.
138
+ * <p>
139
+ * The setting can be changed at any time. It becomes effective with the next
140
+ * record that is being processed.
141
+ * <p>
142
+ * <strong>Default value: {@code false}</strong>
143
+ *
144
+ * @param ignoreMissingIdn if true, missing IDNs do not trigger a
145
+ * {@link MissingIdException} but an empty string is
146
+ * used as record identifier instead.
147
+ */
121
148
public void setIgnoreMissingIdn (final boolean ignoreMissingIdn ) {
122
149
this .ignoreMissingIdn = ignoreMissingIdn ;
123
150
}
@@ -126,6 +153,20 @@ public boolean getIgnoreMissingIdn() {
126
153
return ignoreMissingIdn ;
127
154
}
128
155
156
+ /**
157
+ * Controls whether decomposed unicode characters in field values are
158
+ * normalised to their precomposed version. By default no normalisation is
159
+ * applied. The normalisation is only applied to values, not to field or
160
+ * subfield names.
161
+ * <p>
162
+ * The setting can be changed at any time. It becomes effective with the next
163
+ * record that is being processed.
164
+ * <p>
165
+ * <strong>Default value: {@code false}</strong>
166
+ *
167
+ * @param normalizeUTF8 if true, decomposed unicode characters in values are
168
+ * normalised to their precomposed version.
169
+ */
129
170
public void setNormalizeUTF8 (final boolean normalizeUTF8 ) {
130
171
parserContext .setNormalizeUTF8 (normalizeUTF8 );
131
172
}
@@ -134,6 +175,17 @@ public boolean getNormalizeUTF8() {
134
175
return parserContext .getNormalizeUTF8 ();
135
176
}
136
177
178
+ /**
179
+ * Controls whether fields without subfields are skipped and no events are
180
+ * emitted for them. By default empty fields are skipped.
181
+ * <p>
182
+ * The setting can be changed at any time. It becomes effective with the next
183
+ * record that is being processed.
184
+ * <p>
185
+ * <strong>Default value: {@code true}</strong>
186
+ *
187
+ * @param skipEmptyFields if true, then empty fields are skipped.
188
+ */
137
189
public void setSkipEmptyFields (final boolean skipEmptyFields ) {
138
190
parserContext .setSkipEmptyFields (skipEmptyFields );
139
191
}
@@ -142,6 +194,24 @@ public boolean getSkipEmptyFields() {
142
194
return parserContext .getSkipEmptyFields ();
143
195
}
144
196
197
+ /**
198
+ * Sets whether field names are trimmed (removal of leading and trailing
199
+ * whitespace). By default field names are trimmed.
200
+ * <p>
201
+ * The setting can be changed at any time. It becomes effective with the next
202
+ * record that is being processed.
203
+ * <p>
204
+ * <strong>Default value: {@code true}</strong>
205
+ *
206
+ * @param trimFieldNames if true, then field names are trimmed.
207
+ */
208
+ public void setTrimFieldNames (final boolean trimFieldNames ) {
209
+ parserContext .setTrimFieldNames (trimFieldNames );
210
+ }
211
+
212
+ public boolean getTrimFieldNames () {
213
+ return parserContext .getTrimFieldNames ();
214
+ }
145
215
@ Override
146
216
public void process (final String record ) {
147
217
assert !isClosed ();
0 commit comments