@@ -29,6 +29,9 @@ data_branch="main"
2929cleanup=false
3030online=true
3131batch_size=" $BATCH_SIZE_DEFAULT "
32+ validate=false
33+ validate_only=false
34+ validation_output_file=" $build_dir /ttl_validation_errors.txt"
3235
3336function print_help() {
3437
@@ -39,6 +42,12 @@ function print_help() {
3942 echo " Options:"
4043 echo " -h, --help"
4144 echo " Print this usage help and exit"
45+ echo " --validate"
46+ echo " Validate the *.ttl files before loading them into the DB"
47+ echo " (output: '$validation_output_file ')"
48+ echo " --validate-only"
49+ echo " Validate the *.ttl files and exit"
50+ echo " (output: '$validation_output_file ')"
4251 echo " --batch-size <NUMBER>"
4352 echo " The number of Turtle fiels to load into the DB at once [default: $BATCH_SIZE_DEFAULT ]"
4453 echo " --offline"
@@ -60,6 +69,7 @@ function print_help() {
6069 echo " $script_name --custom-data \" $CLONE_URL_OLD_DATA \" main"
6170 echo " $script_name --local-data /data --batch-size 1000"
6271 echo " $script_name --samples"
72+ echo " $script_name --validate --samples"
6373}
6474
6575# read command-line args
7484 print_help
7585 exit 0
7686 ;;
87+ --validate)
88+ validate=true
89+ ;;
90+ --validate-only)
91+ validate=true
92+ validate_only=true
93+ ;;
7794 --batch-size)
7895 batch_size=" $1 "
7996 shift
@@ -200,6 +217,32 @@ find \
200217num_ttls=" $( cat " $ttl_list_file " | wc -l) "
201218echo " # *.ttl files: $num_ttls "
202219
# Optional syntax check of the collected *.ttl files before loading.
# Reads:  $validate, $validate_only, $batch_size, $ttl_list_file,
#         $jena_db_data_validator, $validation_output_file
# Writes: $validation_output_file (recreated on every validation run)
# Exits:  with status 0 right after validating if $validate_only is true.
if $validate
then
    echo
    echo "Validating *.ttl files syntax ..."
    # Start from an empty report, so that counting its lines further down
    # works even if the list of *.ttl files is empty and no batch ever runs.
    rm -Rf "$validation_output_file"
    : > "$validation_output_file"
    # Hand the listed files to the validator in batches of $batch_size.
    # mapfile also succeeds (with an empty array) at end of input,
    # hence the additional array-length check to terminate the loop.
    time while mapfile -t -n "$batch_size" batch && (( ${#batch[@]} ))
    do
        # A pipeline binds tighter than `||`, so the validator call has to be
        # grouped together with its `2>&1` and its `|| true`; without the
        # grouping, only the (empty) output of `true` reaches grep and tee.
        # `|| true`: a failing validator run must not abort the script.
        {
            "$jena_db_data_validator" \
                --validate \
                "${batch[@]}" \
                2>&1 \
                || true
        } \
            | { grep -v INFO -B 1 || true; } \
            | tee --append "$validation_output_file"
        # (grep exits with 1 if it selects no line, which is the case
        # for a batch that produced INFO lines only; that is not an error.)
    done < "$ttl_list_file"
    echo "Validation results stored in file '$validation_output_file'."
    lines_syntax_violations="$(wc -l < "$validation_output_file")"
    # NOTE(review): assumes each violation contributes two report lines
    # (the context line from `-B 1` plus the non-INFO line). The `--` group
    # separators grep prints are counted too, so this may overestimate;
    # confirm against real validator output.
    num_syntax_violations="$(( lines_syntax_violations / 2 ))"
    echo "# of Turtle syntax violations: $num_syntax_violations"
    if $validate_only
    then
        echo
        echo "done."
        exit 0
    fi
fi
245+
203246rm -Rf " $db_tmp_dir "
204247mkdir -p " $db_tmp_dir "
205248
213256 " ${batch[@]} "
214257# --loader=parallel \
215258done < " $ttl_list_file " 2>&1 | awk -v ORS=' ' \
216- '
259+ '
217260# This AWK script only serves to reformat the output
218261# of the loading procedure into a less spammy
219262# and more informative one.
@@ -269,3 +312,5 @@ echo " --query \"$res_dir/sample-query.txt\""
269312echo
270313echo " Or start the Web interface for running SPARQL queries with:"
271314echo " run/web-ui"
315+ echo
316+ echo " done."
0 commit comments