This repository was archived by the owner on Jun 25, 2020. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathuhhediss.sh
More file actions
executable file
·265 lines (240 loc) · 10.2 KB
/
uhhediss.sh
File metadata and controls
executable file
·265 lines (240 loc) · 10.2 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
#!/bin/bash
# https://github.com/subhh/HOS-MetadataTransformations
#
# Harvests the OAI-PMH endpoint configured below with metha, transforms the
# harvested records with OpenRefine and optionally ingests the result into
# Solr and/or an external OpenRefine service.

# change directory to location of shell script; all relative paths used by
# this script (../opt, ../data, ../log, ../cfg) are resolved from there, so
# stop right away if the directory cannot be entered
cd "$(dirname "$0")" || { echo 1>&2 "cannot change directory to $(dirname "$0")"; exit 1; }

# pathnames
metha_sync="/usr/sbin/metha-sync"
metha_cat="/usr/sbin/metha-cat"
recordpath=(Records Record) # metha-cat default xml path to harvested records
openrefine_server="$(readlink -f ../opt/openrefine/refine)"
openrefine_client="$(readlink -f ../opt/openrefine-client)"
data_dir="$(readlink -f ../data)"
log_dir="$(readlink -f ../log)"

# config
codename="uhhediss" # used for filename, name of OpenRefine project and value for Solr field "collectionId"
oai_url="http://ediss.sub.uni-hamburg.de/oai2/oai2.php" # base url of OAI-PMH endpoint
oai_set="" # optional: OAI-PMH set spec (e.g. institution)
oai_format="" # optional: OAI-PMH metadata format (e.g. datacite)
ram="2048M" # highest OpenRefine memory load is below 2048M
recordpath+=() # select /Records/Record/ (including /Records/Record/header); add further path elements here to select a deeper node
separator="%E2%90%9F" # multiple values are separated by unicode character unit separator (U+241F)
config_dir="$(readlink -f "../cfg/${codename}")" # location of OpenRefine transformation rules in json format
# help screen: write the command line synopsis to stdout, then terminate the
# script with exit status 1
usage() {
  printf '%s\n' \
    "Usage: ./${codename}.sh [-p PORT] [-s SOLRURL] [-d OPENREFINEURL]" \
    "== options ==" \
    "-p PORT PORT on which OpenRefine should run (default: 3334)" \
    "-s SOLRURL ingest data to specified Solr core" \
    "-d OPENREFINEURL ingest data to external OpenRefine service" \
    "== example ==" \
    "./${codename}.sh -p 3334 -s http://localhost:8983/solr/hos -d http://localhost:3333"
  exit 1
}
# defaults
port="3334"

# option string for getopts; the leading ":" selects silent error reporting,
# which is what makes the "\?" and ":" arms below receive the offending
# option character in OPTARG
options=":p:s:d:h"

# Parse the command line options.
# Globals:   options (read); port, solr_url (array), openrefine_url, OPTIND
#            (written)
# Arguments: the command line arguments of the script
# Outputs:   for invalid input an error message on stderr followed by the
#            help screen
# Exits:     through usage on -h and on invalid input
parse_options() {
  local opt
  OPTIND=1
  while getopts "$options" opt; do
    case $opt in
      p ) port=${OPTARG} ;;
      s ) solr_url+=("${OPTARG%/}") ;; # may be given several times; trailing slash is dropped
      d ) openrefine_url=${OPTARG%/} ;;
      h ) usage ;;
      \? ) echo 1>&2 "Unknown option: -$OPTARG"; usage; exit 1;;
      : ) echo 1>&2 "Missing option argument for -$OPTARG"; usage; exit 1;;
      * ) echo 1>&2 "Unimplemented option: -$opt"; usage; exit 1;;
    esac
  done
}

# get user input
parse_options "$@"
shift $((OPTIND - 1))
# load solr credentials from file (presumably sets solr_user and solr_pass,
# which the Solr ingest step reads - verify against the credentials file)
if [ -f "../cfg/solr/credentials" ]; then source "../cfg/solr/credentials"; fi

# List the transformation rule files of a directory.
# Arguments: $1 - directory to search
# Outputs:   base name of every regular file matched by $1/*.json, one per
#            line; nothing if there is no match
list_json_files() {
  find -L "$1"/*.json -type f -printf "%f\n" 2>/dev/null
}

# declare additional variables
date=$(date +%Y%m%d_%H%M%S)
openrefine_tmp="/tmp/openrefine_${date}"
# the checkpoint arrays are 1-based: entry n holds the starting time (epoch
# seconds) and the title of step n
checkpoints=${#checkpointdate[@]}
checkpointdate[$((checkpoints + 1))]=$(date +%s)
checkpointname[$((checkpoints + 1))]="Start process"
memoryload=()
multivalue_config=()
# split OPENREFINEURL (e.g. http://localhost:3333) into host and port
# NOTE(review): for a URL without port both variables end up with the host name
external=${openrefine_url##*/}
external_host=${external%:*}
external_port=${external##*:}
# read one file name per line so that names containing spaces stay intact
if [ -n "${config_dir// }" ] ; then mapfile -t jsonfiles < <(list_json_files "${config_dir}") ; fi
# safe cleanup handler: announce the cleanup, kill the process whose id is
# stored in pid, delete the temporary OpenRefine directory of this run and
# wait for the remaining child processes
cleanup() {
  printf '%s\n' "cleanup..."
  kill -KILL "${pid}" > /dev/null 2>&1
  rm -rf "/tmp/openrefine_${date}"
  wait
}
# clean up and terminate when the script is interrupted
trap 'cleanup; exit' HUP INT QUIT TERM
# Simple Logging: from here on stdout and stderr of this script are sent
# through tee, which passes them on and appends a copy to the log file of
# this run
exec &> >(tee -a "${log_dir}/${codename}_${date}.log")
# print variables: one line per setting of this run, then a blank line
printf 'Code name: %s\n' "${codename}"
printf 'OAI server: %s\n' "${oai_url}"
printf 'OAI set: %s\n' "${oai_set}"
printf 'OAI metadata format: %s\n' "${oai_format}"
printf 'Transformation rules: %s\n' "${jsonfiles[*]}"
printf 'OpenRefine heap space: %s\n' "${ram}"
printf 'OpenRefine port: %s\n' "${port}"
printf 'Solr core URL(s): %s\n' "${solr_url[*]}"
# only tell whether credentials are set, never print them
printf 'Solr credentials: %s\n' "${solr_user:+yes}"
printf 'OpenRefine service URL: %s\n' "${openrefine_url}"
printf 'Logfile: %s\n' "${codename}_${date}.log"
printf '\n'
# Check connection to OAI endpoint: send an Identify request and abort with
# exit status 2 unless the endpoint answers with HTTP status 200
http_status=$(curl -skL -w "%{http_code}" "${oai_url}?verb=Identify" -o /dev/null --connect-timeout 15)
if [[ "${http_status}" -ne 200 ]]; then
  printf '%s\n' "no connection to OAI endpoint ${oai_url}" >&2
  exit 2
fi
# Download data via OAI with metha
checkpoints=${#checkpointdate[@]}
checkpointdate[$((checkpoints + 1))]=$(date +%s)
checkpointname[$((checkpoints + 1))]="Download via OAI with metha"
echo "=== $checkpoints. ${checkpointname[$((checkpoints + 1))]} ==="
echo ""
echo "starting time: $(date --date="@${checkpointdate[$((checkpoints + 1))]}")"
echo ""
# optional harvesting parameters, passed identically to metha-sync and
# metha-cat; kept in an array so that each value stays a single argument
metha_args=()
if [ -n "$oai_set" ]; then metha_args+=(-set "$oai_set"); fi
if [ -n "$oai_format" ]; then metha_args+=(-format "$oai_format"); fi
oai_file="${data_dir}/01_oai/${codename}_${date}.xml"
# run metha-sync first, then store the output of metha-cat as xml file
"$metha_sync" "${metha_args[@]}" "$oai_url"
"$metha_cat" "${metha_args[@]}" "$oai_url" > "${oai_file}"
# number of lines containing an opening <Record> tag
records_metha=$(grep -c '<Record>' "${oai_file}")
echo "saved $records_metha records in ${oai_file}"
echo ""
# Launch OpenRefine server
checkpoints=${#checkpointdate[@]}
checkpointdate[$((checkpoints + 1))]=$(date +%s)
checkpointname[$((checkpoints + 1))]="Launch OpenRefine server"
echo "=== $checkpoints. ${checkpointname[$((checkpoints + 1))]} ==="
echo ""
echo "starting time: $(date --date="@${checkpointdate[$((checkpoints + 1))]}")"
echo ""
# start the server in the background and remember its process id for the
# memory statistics and for cleanup
"$openrefine_server" -p "${port}" -d "$openrefine_tmp" -m "${ram}" &
pid=$!
# poll once per second until the start page is served; give up as soon as
# the server process is gone instead of waiting forever
until wget -q -O - "http://localhost:${port}" | grep -q "OpenRefine" ; do
  if ! kill -0 "${pid}" 2>/dev/null; then
    echo 1>&2 "OpenRefine server (pid ${pid}) terminated before it became available on port ${port}"
    cleanup
    exit 2
  fi
  sleep 1
done
echo ""
# Load data
checkpoints=${#checkpointdate[@]}
checkpointdate[$((checkpoints + 1))]=$(date +%s)
checkpointname[$((checkpoints + 1))]="Load data"
echo "=== $checkpoints. ${checkpointname[$((checkpoints + 1))]} ==="
echo ""
echo "starting time: $(date --date="@${checkpointdate[$((checkpoints + 1))]}")"
echo ""
# one --recordPath option per element of the xml path to the records
recordpath_args=()
for element in "${recordpath[@]}"; do
  recordpath_args+=("--recordPath=${element}")
done
# create an OpenRefine project from the harvested xml file
"$openrefine_client" -P "${port}" --create "${data_dir}/01_oai/${codename}_${date}.xml" "${recordpath_args[@]}"
echo ""
# show the resource usage of the server process and remember its resident
# set size for the final statistics
ps -o start,etime,%mem,%cpu,rss -p "${pid}" --sort=start
# intentionally unquoted: word splitting strips the padding of the ps output
memoryload+=($(ps --no-headers -o rss -p "${pid}"))
echo ""
# Transform data
checkpoints=${#checkpointdate[@]}
checkpointdate[$((checkpoints + 1))]=$(date +%s)
checkpointname[$((checkpoints + 1))]="Transform data"
echo "=== $checkpoints. ${checkpointname[$((checkpoints + 1))]} ==="
echo ""
echo "starting time: $(date --date="@${checkpointdate[$((checkpoints + 1))]}")"
echo ""
# apply the transformation rule files one after the other to the project of
# this run and record the memory load of the server after each of them
for f in "${jsonfiles[@]}" ; do
  echo "transform ${f}..."
  "$openrefine_client" -P "${port}" --apply "${config_dir}/${f}" "${codename}_${date}"
  ps -o start,etime,%mem,%cpu,rss -p "${pid}" --sort=start
  # intentionally unquoted: word splitting strips the padding of the ps output
  memoryload+=($(ps --no-headers -o rss -p "${pid}"))
  echo ""
done
echo ""
# Export data
checkpoints=${#checkpointdate[@]}
checkpointdate[$((checkpoints + 1))]=$(date +%s)
checkpointname[$((checkpoints + 1))]="Export data"
echo "=== $checkpoints. ${checkpointname[$((checkpoints + 1))]} ==="
echo ""
echo "starting time: $(date --date="@${checkpointdate[$((checkpoints + 1))]}")"
echo ""
# write the transformed project of this run to a tsv file
"$openrefine_client" -P "${port}" --export --output="${data_dir}/02_transformed/${codename}_${date}.tsv" "${codename}_${date}"
echo ""
# show the resource usage of the server process and remember its resident
# set size for the final statistics
ps -o start,etime,%mem,%cpu,rss -p "${pid}" --sort=start
# intentionally unquoted: word splitting strips the padding of the ps output
memoryload+=($(ps --no-headers -o rss -p "${pid}"))
echo ""
# Stop OpenRefine server: register the checkpoint, then let the cleanup
# handler kill the server and remove its temporary directory
checkpoints=${#checkpointdate[@]}
step=$((checkpoints + 1))
checkpointdate[step]=$(date +%s)
checkpointname[step]="Stop OpenRefine server"
printf '=== %s. %s ===\n\n' "${checkpoints}" "${checkpointname[step]}"
printf 'starting time: %s\n\n' "$(date --date="@${checkpointdate[step]}")"
cleanup
printf '\n'
# Grep log for exceptions: collect all log lines that mention an exception
# (case-insensitive) but not "workspace"; if there are any, print them with
# a warning on stderr and abort with exit status 2
exceptions=$(grep -i exception "${log_dir}/${codename}_${date}.log" | grep -v "workspace")
if [[ -n "${exceptions}" ]]; then
  printf '%s\n' "${exceptions}" >&2
  printf '%s\n' "Konfiguration scheint fehlerhaft zu sein! Bitte manuell prüfen." >&2
  exit 2
fi
# Ingest data into Solr

# Build the query string fragment that declares every column of a tsv file
# as multi-valued field.
# Arguments: $1 - tsv file whose first line holds the column names
#            $2 - separator between multiple values (as used in the url)
# Outputs:   "&f.<column>.separator=<separator>" for each column,
#            concatenated and without trailing newline
solr_multivalue_params() {
  local header column
  local -a columns=()
  IFS= read -r header < "$1"
  # split on tabs only, so a column name is never broken up at a space
  IFS=$'\t' read -r -a columns <<< "${header}"
  for column in "${columns[@]}"; do
    printf '&f.%s.separator=%s' "${column}" "$2"
  done
}

if [ -n "$solr_url" ]; then
  checkpoints=${#checkpointdate[@]}
  checkpointdate[$((checkpoints + 1))]=$(date +%s)
  checkpointname[$((checkpoints + 1))]="Ingest data into Solr"
  echo "=== $checkpoints. ${checkpointname[$((checkpoints + 1))]} ==="
  echo ""
  echo "starting time: $(date --date="@${checkpointdate[$((checkpoints + 1))]}")"
  echo ""
  tsv_file="${data_dir}/02_transformed/${codename}_${date}.tsv"
  # read header from tsv
  multivalue_config=$(solr_multivalue_params "${tsv_file}" "${separator}")
  # authentication is only passed to curl if a user name is configured; the
  # array keeps user and password together as one argument
  curl_auth=()
  if [ -n "$solr_user" ]; then curl_auth=(-u "${solr_user}:${solr_pass}"); fi
  for i in "${solr_url[@]}"; do
    # remove the records of this collection before loading the new export
    echo "delete existing data in ${i}"
    curl "${curl_auth[@]}" -sS "${i}/update" -H "Content-Type: application/json" --data-binary "{ \"delete\": { \"query\": \"collectionId:${codename}\" } }" | jq .responseHeader
    echo ""
    echo "load new data in ${i}"
    curl "${curl_auth[@]}" --progress-bar "${i}/update/csv?commit=true&optimize=true&separator=%09&literal.collectionId=${codename}&split=true${multivalue_config}" --data-binary @- -H 'Content-type:text/plain; charset=utf-8' < "${tsv_file}" | jq .responseHeader
    echo ""
  done
fi
# Ingest data into OpenRefine: replace the project "<codename>_live" on the
# external OpenRefine service with the tsv export of this run (only if an
# OPENREFINEURL was given)
if [ -n "$openrefine_url" ]; then
  checkpoints=${#checkpointdate[@]}
  checkpointdate[$((checkpoints + 1))]=$(date +%s)
  checkpointname[$((checkpoints + 1))]="Ingest data into OpenRefine"
  echo "=== $checkpoints. ${checkpointname[$((checkpoints + 1))]} ==="
  echo ""
  echo "starting time: $(date --date="@${checkpointdate[$((checkpoints + 1))]}")"
  echo ""
  echo "delete existing project ${codename}_live..."
  "${openrefine_client}" -H "${external_host}" -P "${external_port}" --delete "${codename}_live"
  echo ""
  echo "create new project ${codename}_live..."
  "${openrefine_client}" -H "${external_host}" -P "${external_port}" --create "${data_dir}/02_transformed/${codename}_${date}.tsv" --encoding=UTF-8 --projectName="${codename}_live"
  echo ""
fi
# calculate and print checkpoints

# Format a duration as hh:mm:ss; the hours are not wrapped at 24, so runs
# that take longer than a day are reported correctly.
# Arguments: $1 - duration in seconds
# Outputs:   the formatted duration, without trailing newline
format_duration() {
  local seconds=$1
  printf '%02d:%02d:%02d' "$((seconds / 3600))" "$((seconds % 3600 / 60))" "$((seconds % 60))"
}

echo "=== Statistics ==="
echo ""
checkpoints=${#checkpointdate[@]}
checkpointdate[$((checkpoints + 1))]=$(date +%s)
checkpointname[$((checkpoints + 1))]="End process"
echo "starting time and run time of each step:"
# append one more timestamp without name, so that "End process" also has a
# successor against which its run time can be computed
checkpoints=${#checkpointdate[@]}
checkpointdate[$((checkpoints + 1))]=$(date +%s)
for ((i = 1; i <= checkpoints; i++)); do
  diffsec=$((checkpointdate[i + 1] - checkpointdate[i]))
  printf '%35s %s (%s)\n' "${checkpointname[i]}" "$(date --date="@${checkpointdate[i]}")" "$(format_duration "${diffsec}")"
done
echo ""
# total run time: from the first checkpoint to "End process"
diffsec=$((checkpointdate[checkpoints] - checkpointdate[1]))
echo "$records_metha records"
echo "total run time: $(format_duration "${diffsec}") (hh:mm:ss)"
# calculate and print memory load: pick the largest of the recorded values
# and print it divided by 1024 next to the configured heap size
max=${memoryload[0]}
for n in "${memoryload[@]}"; do
  if ((n > max)); then
    max=${n}
  fi
done
printf 'highest memory load: %s MB of %s\n' "$((max / 1024))" "${ram}"