Skip to content

Commit 1b21fb5

Browse files
committed
Add data-validation support to fill script
1 parent 5d4d0e5 commit 1b21fb5

2 files changed

Lines changed: 47 additions & 1 deletion

File tree

run/_common

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@ JENA_HOME="$HOME/.local/opt/apache-jena-$JENA_VERSION"
3737
# For the (SPARQL) query web interface to the DB
3838
JENA_FUSEKI_HOME="$HOME/.local/opt/apache-jena-fuseki-$JENA_VERSION"
3939

40+
jena_db_data_validator="$JENA_HOME/bin/riot"
4041
jena_db_data_injector="$JENA_HOME/bin/tdb2.tdbloader"
4142
jena_db_data_querier="$JENA_HOME/bin/tdb2.tdbquery"
4243
jena_fuseki_server="$JENA_FUSEKI_HOME/fuseki-server"

run/fill-db

Lines changed: 46 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,9 @@ data_branch="main"
2929
cleanup=false
3030
online=true
3131
batch_size="$BATCH_SIZE_DEFAULT"
32+
validate=false
33+
validate_only=false
34+
validation_output_file="$build_dir/ttl_validation_errors.txt"
3235

3336
function print_help() {
3437

@@ -39,6 +42,12 @@ function print_help() {
3942
echo "Options:"
4043
echo " -h, --help"
4144
echo " Print this usage help and exit"
45+
echo " --validate"
46+
echo " Validate the *.ttl files before loading them into the DB"
47+
echo " (output: '$validation_output_file')"
48+
echo " --validate-only"
49+
echo " Validate the *.ttl files and exit"
50+
echo " (output: '$validation_output_file')"
4251
echo " --batch-size <NUMBER>"
4352
echo " The number of Turtle fiels to load into the DB at once [default: $BATCH_SIZE_DEFAULT]"
4453
echo " --offline"
@@ -60,6 +69,7 @@ function print_help() {
6069
echo " $script_name --custom-data \"$CLONE_URL_OLD_DATA\" main"
6170
echo " $script_name --local-data /data --batch-size 1000"
6271
echo " $script_name --samples"
72+
echo " $script_name --validate --samples"
6373
}
6474

6575
# read command-line args
@@ -74,6 +84,13 @@ do
7484
print_help
7585
exit 0
7686
;;
87+
--validate)
88+
validate=true
89+
;;
90+
--validate-only)
91+
validate=true
92+
validate_only=true
93+
;;
7794
--batch-size)
7895
batch_size="$1"
7996
shift
@@ -200,6 +217,32 @@ find \
200217
num_ttls="$(cat "$ttl_list_file" | wc -l)"
201218
echo "# *.ttl files: $num_ttls"
202219

220+
# Validates the syntax of the Turtle files listed in a file, in batches,
# and collects the validator's findings in an output file.
#
# Globals:
#   jena_db_data_validator (read) - the validator command to run;
#       it is called as: <validator> --validate <file>...
# Arguments:
#   $1 - file containing the paths of the *.ttl files, one per line
#   $2 - file to store the findings in (gets replaced)
#   $3 - maximum number of *.ttl files to hand to the validator per call
# Outputs:
#   The findings are written to stdout and to the file given as $2.
#   That file always exists afterwards, and is empty if nothing was found.
function validate_ttl_files() {
    local list_file="$1"
    local output_file="$2"
    local max_per_batch="$3"
    local batch

    # Start from an existing, empty file, so that the caller can count
    # its lines even if not a single finding was reported.
    rm -Rf "$output_file"
    mkdir -p "$(dirname "$output_file")"
    : > "$output_file"

    while mapfile -t -n "$max_per_batch" batch && ((${#batch[@]}))
    do
        # NOTE The validator call is wrapped in a group, because a pipeline
        #      binds tighter then '||': 'cmd || true 2>&1 | grep ...'
        #      would pipe the output of 'true' instead of the one of 'cmd'.
        #      Both the validator (files with violations) and grep
        #      (nothing left after filtering) may exit non-zero
        #      without this being an error for us.
        #      stdin is redirected, so the validator can never consume
        #      parts of the list of files this loop reads from.
        {
            "$jena_db_data_validator" \
                --validate \
                "${batch[@]}" \
                < /dev/null 2>&1 \
                || true
        } \
            | { grep -v INFO -B 1 --no-group-separator || true; } \
            | tee --append "$output_file"
    done < "$list_file"
}

if "${validate:-false}"
then
    echo
    echo "Validating *.ttl files syntax ..."
    time validate_ttl_files \
        "$ttl_list_file" \
        "$validation_output_file" \
        "$batch_size"
    echo "Validation results stored in file '$validation_output_file'."
    # NOTE(review): This assumes that each violation shows up as two lines
    #      (one INFO line naming the file, plus one line with the problem);
    #      confirm against the actual output of the validator.
    #      The '--' separators grep would otherwise put between
    #      non-adjacent matches are suppressed (see above),
    #      so they do not distort this count.
    lines_syntax_violations="$(wc -l < "$validation_output_file")"
    num_syntax_violations="$((lines_syntax_violations / 2))"
    echo "# of Turtle syntax violations: $num_syntax_violations"
    if "${validate_only:-false}"
    then
        echo
        echo "done."
        exit 0
    fi
fi
245+
203246
rm -Rf "$db_tmp_dir"
204247
mkdir -p "$db_tmp_dir"
205248

@@ -213,7 +256,7 @@ do
213256
"${batch[@]}"
214257
# --loader=parallel \
215258
done < "$ttl_list_file" 2>&1 | awk -v ORS='' \
216-
'
259+
'
217260
# This AWK script only serves to reformat the output
218261
# of the loading procedure into a less spammy
219262
# and more informative one.
@@ -269,3 +312,5 @@ echo " --query \"$res_dir/sample-query.txt\""
269312
echo
270313
echo "Or start the Web interface for running SPARQL queries with:"
271314
echo "run/web-ui"
315+
echo
316+
echo "done."

0 commit comments

Comments
 (0)