Skip to content

Commit 18e287c

Browse files
shntnu and afermg authored
Make zenodo scripts reusable (#129)
* Update update_etags.sh * Update upload_index.sh * Update upload_index.sh * Update upload_index.sh * Update upload_index.sh * use standard form, check token sooner * quote variables, drop csvkit dep * quote variable * drop csvkit dep * cleanup * shebang, formatting * bug fix * simplify regex * deps: add shellcheck and shfmt to flake.nix * chore: quote to avoid potential globbing * format: apply shfmt to scripts * fix(upload_index.sh): use /usr/bin/env bash shebang * fix: quote 'etag' header; refactor get_column * refactor: replace column in place; support etag in any col --------- Co-authored-by: Alán F. Muñoz <afer.mg@gmail.com>
1 parent 074a8a8 commit 18e287c

File tree

3 files changed

+120
-73
lines changed

3 files changed

+120
-73
lines changed

flake.nix

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,8 @@
2727
curl
2828
gawk
2929
moreutils
30+
shellcheck
31+
shfmt
3032
];
3133
};
3234
};

manifests/src/update_etags.sh

Lines changed: 37 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,38 @@
11
#!/usr/bin/env bash
2-
# Returns the updated ETag for elements in the second column of $1 alongside the first two columns.
3-
cat $1 |
4-
tail -n +2 | # Remove headers
5-
cut -f2 -d',' | # Select url column
6-
xargs -I {} -- curl -I --silent "{}" | # Fetch remote metadata
7-
grep "ETag" | # Select etag field from resulting html
8-
awk '{print $2}' | # Remove prefix
9-
sed 's/\r$//' | # Remove carriage
10-
sed 1i'"etag"' | # add header
11-
paste - $1 -d',' | # Merge with original file
12-
awk -F ',' '{print $2","$3","$1}' # Print in the right order
2+
# Fetch updated ETag values for URLs in a CSV file.
3+
4+
# Note that quotes are expected in the CSV header but omitted when naming the columns below; get_column adds them.
5+
input_file="$1"
6+
url_header="url"
7+
etag_header="etag"
8+
9+
get_column() {
10+
# gets id of column $1 in ${input_file}.
11+
awk -F',' -v col="\"$1\"" 'NR==1 { for (i=1; i<=NF; ++i) { if ($i==col) print i } }' "${input_file}"
12+
}
13+
14+
# Check if input file is provided
15+
if [ -z "${input_file}" ]; then
16+
echo "Usage: $0 <input_file>"
17+
exit 1
18+
fi
19+
20+
url_column=$(get_column "${url_header}")
21+
urls=$(awk -F',' -v col="${url_column}" 'NR>1 {gsub(/^"|"$/, "", $col); print $col}' "${input_file}")
22+
23+
# Fetch ETags for each URL in a loop
24+
etag_values='"etag"'
25+
while IFS= read -r url; do
26+
etag=$(curl -I --silent "${url}" | awk '/ETag:/ {print $2}')
27+
etag_values+="\n${etag}"
28+
done <<<"$urls"
29+
30+
# Locate the existing ETag column, if present (it is replaced in place below)
31+
etag_column=$(get_column "${etag_header}")
32+
33+
# Combine original data (without ETag) with new ETag values
34+
if [[ -n "${etag_column}" ]]; then # Replace $etag_column in $input_file with $etag_values
35+
awk -F',' -v OFS=',' -v col="${etag_column}" 'NR==FNR{a[NR]=$1;next}{$col=a[FNR]}1' <(echo -e "${etag_values}") "${input_file}"
36+
else # Append $etag_values as a new column on the right
37+
paste -d',' "${input_file}" <(echo -e "${etag_values}")
38+
fi

manifests/src/upload_index.sh

Lines changed: 81 additions & 62 deletions
Original file line numberDiff line numberDiff line change
@@ -1,101 +1,120 @@
1+
#!/usr/bin/env bash
12
# Find the latest version of the dataset
2-
ZENODO_ENDPOINT="https://zenodo.org"
3-
DEPOSITION_PREFIX="${ZENODO_ENDPOINT}/api/deposit/depositions"
43
ORIGINAL_ID="13892061"
54
FILE_TO_VERSION="manifests/profile_index.csv"
6-
FILENAME=$(echo ${FILE_TO_VERSION} | sed 's+.*/++g')
5+
METADATA_JSON='{
6+
"metadata": {
7+
"title": "The Joint Undertaking for Morphological Profiling (JUMP) Consortium Datasets Index",
8+
"creators": [
9+
{
10+
"name": "The JUMP Cell Painting Consortium"
11+
}
12+
],
13+
"upload_type": "dataset",
14+
"access_right": "open"
15+
}
16+
}'
17+
18+
ZENODO_ENDPOINT="https://zenodo.org"
19+
DEPOSITION_PREFIX="${ZENODO_ENDPOINT}/api/deposit/depositions"
20+
21+
FILENAME=${FILE_TO_VERSION##*/}
722

823
echo "Checking that S3 ETags match their local counterpart"
9-
S3_ETAGS=$(cat ${FILE_TO_VERSION} | tail -n +2 | cut -f2 -d',' | xargs -I {} -- curl -I --silent "{}" | grep ETag | awk '{print $2}' | sed 's/\r$//' | md5sum | cut -f1 -d" ")
10-
LOCAL_ETAGS=$(cat ${FILE_TO_VERSION} | tail -n +2 | cut -f3 -d',' | md5sum | cut -f1 -d" ")
1124

12-
echo "Remote ${S3_ETAGS} vs Local ${LOCAL_ETAGS} values"
13-
if [ "${S3_ETAGS}" != "${LOCAL_ETAGS}" ]; then
14-
echo "At least one ETag does not match their url."
15-
exit 1
16-
fi
25+
# Extract URLs and ETags
26+
url_column=$(head -n1 "${FILE_TO_VERSION}" | tr ',' '\n' | grep -n "url" | cut -d':' -f1)
27+
urls=$(awk -F',' -v col="${url_column}" 'NR>1 {gsub(/^"|"$/, "", $col); print $col}' "${FILE_TO_VERSION}")
1728

18-
if [ -z "${ORIGINAL_ID}" ]; then # Only get latest id when provided an original one
19-
echo "Creating new deposition"
20-
DEPOSITION_ENDPOINT="${DEPOSITION_PREFIX}"
21-
else # Update existing dataset
22-
echo "Previous ID Exists"
23-
LATEST_ID=$(curl "${ZENODO_ENDPOINT}/records/${ORIGINAL_ID}/latest" |
24-
grep records | sed 's/.*href=".*\.org\/records\/\(.*\)".*/\1/')
25-
REMOTE_HASH=$(curl -H "Content-Type: application/json" -X GET --data "{}" \
26-
"${DEPOSITION_PREFIX}/${LATEST_ID}/files?access_token=${ZENODO_TOKEN}" |
27-
jq ".[] .links .download" | xargs curl | md5sum | cut -f1 -d" ")
28-
LOCAL_HASH=$(md5sum ${FILE_TO_VERSION} | cut -f1 -d" ")
29-
30-
echo "Checking for changes in file contents: Remote ${REMOTE_HASH} vs Local ${LOCAL_HASH}"
31-
if [ "${REMOTE_HASH}" == "${LOCAL_HASH}" ]; then
32-
echo "The urls and md5sums have not changed"
33-
exit 0
34-
fi
35-
36-
echo "Creating new version"
37-
DEPOSITION_ENDPOINT="${DEPOSITION_PREFIX}/${LATEST_ID}/actions/newversion"
38-
fi
29+
etag_column=$(head -n1 "${FILE_TO_VERSION}" | tr ',' '\n' | grep -n "etag" | cut -d':' -f1)
30+
local_etags=$(awk -F',' -v col="${etag_column}" 'NR>1 {gsub(/^"|"$/, "", $col); print $col}' "${FILE_TO_VERSION}")
3931

32+
s3_etags=""
33+
while IFS= read -r url; do
34+
etag=$(curl -I --silent "$url" | awk '/[eE][tT]ag:/ {print $2}' | tr -d '\r"')
35+
s3_etags+="${etag}\n"
36+
done <<<"${urls}"
37+
38+
# Remove the trailing newline from s3_etags
39+
s3_etags=$(echo -e "${s3_etags}" | sed '/^$/d')
40+
41+
# Calculate checksums for comparison
42+
s3_etags_hash=$(echo -e "${s3_etags}" | md5sum | cut -f1 -d" ")
43+
local_etags_hash=$(echo "${local_etags}" | md5sum | cut -f1 -d" ")
44+
45+
echo "Remote ${s3_etags_hash} vs Local ${local_etags_hash} values"
46+
if [ "${s3_etags_hash}" != "${local_etags_hash}" ]; then
47+
echo "At least one ETag does not match their url."
48+
exit 1
49+
fi
4050

4151
if [ -z "${ZENODO_TOKEN}" ]; then # Check Zenodo Token
42-
echo "Access token not available"
43-
exit 1
52+
echo "Access token not available"
53+
exit 1
4454
else
45-
echo "Access token found."
55+
echo "Access token found."
4656
fi
4757

58+
if [ -z "${ORIGINAL_ID}" ]; then # Only get latest id when provided an original one
59+
echo "Creating new deposition"
60+
DEPOSITION_ENDPOINT="${DEPOSITION_PREFIX}"
61+
else # Update existing dataset
62+
echo "Previous ID Exists"
63+
LATEST_ID=$(curl "${ZENODO_ENDPOINT}/records/${ORIGINAL_ID}/latest" |
64+
grep records | sed 's/.*href=".*\.org\/records\/\(.*\)".*/\1/')
65+
REMOTE_HASH=$(curl -H "Content-Type: application/json" -X GET --data "{}" \
66+
"${DEPOSITION_PREFIX}/${LATEST_ID}/files?access_token=${ZENODO_TOKEN}" |
67+
jq ".[] .links .download" | xargs curl | md5sum | cut -f1 -d" ")
68+
LOCAL_HASH=$(md5sum ${FILE_TO_VERSION} | cut -f1 -d" ")
69+
70+
echo "Checking for changes in file contents: Remote ${REMOTE_HASH} vs Local ${LOCAL_HASH}"
71+
if [ "${REMOTE_HASH}" == "${LOCAL_HASH}" ]; then
72+
echo "The urls and md5sums have not changed"
73+
exit 0
74+
fi
75+
76+
echo "Creating new version"
77+
DEPOSITION_ENDPOINT="${DEPOSITION_PREFIX}/${LATEST_ID}/actions/newversion"
78+
fi
4879

4980
# Create new deposition
5081
DEPOSITION=$(curl -H "Content-Type: application/json" \
51-
-X POST\
52-
--data "{}" \
53-
"${DEPOSITION_ENDPOINT}?access_token=${ZENODO_TOKEN}"\
54-
| jq .id)
82+
-X POST --data "{}" \
83+
"${DEPOSITION_ENDPOINT}?access_token=${ZENODO_TOKEN}" |
84+
jq .id)
5585
echo "New deposition ID is ${DEPOSITION}"
5686

5787
# Variables
5888
BUCKET_DATA=$(curl "${DEPOSITION_PREFIX}/${DEPOSITION}?access_token=${ZENODO_TOKEN}")
5989
BUCKET=$(echo "${BUCKET_DATA}" | jq --raw-output .links.bucket)
6090

6191
if [ "${BUCKET}" = "null" ]; then
62-
echo "Could not find URL for upload. Response from server:"
63-
echo "${BUCKET_DATA}"
64-
exit 1
92+
echo "Could not find URL for upload. Response from server:"
93+
echo "${BUCKET_DATA}"
94+
exit 1
6595
fi
6696

6797
# Upload file
6898
echo "Uploading file ${FILE_TO_VERSION} to bucket ${BUCKET}"
6999
cat ${FILE_TO_VERSION}
70100
curl -o /dev/null \
71-
--upload-file ${FILE_TO_VERSION} \
72-
${BUCKET}/${FILENAME}?access_token="${ZENODO_TOKEN}"
73-
101+
--upload-file ${FILE_TO_VERSION} \
102+
"${BUCKET}"/"${FILENAME}"?access_token="${ZENODO_TOKEN}"
74103

75104
# Upload Metadata
76-
echo -e '{"metadata": {
77-
"title": "The Joint Undertaking for Morphological Profiling (JUMP) Consortium Datasets Index",
78-
"creators": [
79-
{
80-
"name": "The JUMP Cell Painting Consortium"
81-
}
82-
],
83-
"upload_type": "dataset",
84-
"access_right": "open"
85-
}}' > metadata.json
105+
echo -e "${METADATA_JSON}" >metadata.json
86106

87107
NEW_DEPOSITION_ENDPOINT="${DEPOSITION_PREFIX}/${DEPOSITION}"
88108
echo "Uploading file to ${NEW_DEPOSITION_ENDPOINT}"
89109
curl -H "Content-Type: application/json" \
90-
-X PUT\
91-
--data @metadata.json \
92-
"${NEW_DEPOSITION_ENDPOINT}?access_token=${ZENODO_TOKEN}"
110+
-X PUT \
111+
--data @metadata.json \
112+
"${NEW_DEPOSITION_ENDPOINT}?access_token=${ZENODO_TOKEN}"
93113

94114
# Publish
95115
echo "Publishing to ${NEW_DEPOSITION_ENDPOINT}"
96116
curl -H "Content-Type: application/json" \
97-
-X POST\
98-
--data "{}"\
99-
"${NEW_DEPOSITION_ENDPOINT}/actions/publish?access_token=${ZENODO_TOKEN}"\
100-
| jq .id
101-
117+
-X POST \
118+
--data "{}" \
119+
"${NEW_DEPOSITION_ENDPOINT}/actions/publish?access_token=${ZENODO_TOKEN}" |
120+
jq .id

0 commit comments

Comments
 (0)