Skip to content

Commit ec0419a

Browse files
afermgshntnu
andauthored
chore: Replace profile index file format (csv->json) (#152)
* chore: Replace profile index file format (csv->json) * change: adjust update_etag.sh to match new json structure * docs: update readme for manifest update * Update profile_index.json: specify recipe and config permalinks * docs: add permalink information --------- Co-authored-by: Shantanu Singh <shsingh@broadinstitute.org>
1 parent d824c43 commit ec0419a

File tree

4 files changed

+76
-45
lines changed

4 files changed

+76
-45
lines changed

manifests/profile_index.csv

Lines changed: 0 additions & 9 deletions
This file was deleted.

manifests/profile_index.json

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
[
2+
{
3+
"subset": "orf",
4+
"url": "https://cellpainting-gallery.s3.amazonaws.com/cpg0016-jump-assembled/source_all/workspace/profiles/jump-profiling-recipe_2024_a917fa7/ORF/profiles_wellpos_cc_var_mad_outlier_featselect_sphering_harmony/profiles_wellpos_cc_var_mad_outlier_featselect_sphering_harmony.parquet",
5+
"recipe_permalink": "https://github.com/broadinstitute/jump-profiling-recipe/tree/a917fa79342ff92cf0ea05d6d9174d9028a90f8f",
6+
"config_permalink": "https://github.com/broadinstitute/jump-profiling-recipe/blob/a917fa79342ff92cf0ea05d6d9174d9028a90f8f/inputs/orf.json",
7+
"etag": "c05a241135dcedda4e9cc639480b3f8e-44"
8+
},
9+
{
10+
"subset": "crispr",
11+
"url": "https://cellpainting-gallery.s3.amazonaws.com/cpg0016-jump-assembled/source_all/workspace/profiles/jump-profiling-recipe_2024_a917fa7/CRISPR/profiles_wellpos_cc_var_mad_outlier_featselect_sphering_harmony_PCA_corrected/profiles_wellpos_cc_var_mad_outlier_featselect_sphering_harmony_PCA_corrected.parquet",
12+
"recipe_permalink": "https://github.com/broadinstitute/jump-profiling-recipe/tree/a917fa79342ff92cf0ea05d6d9174d9028a90f8f",
13+
"config_permalink": "https://github.com/broadinstitute/jump-profiling-recipe/blob/a917fa79342ff92cf0ea05d6d9174d9028a90f8f/inputs/crispr.json",
14+
"etag": "4c59782c0dd5244f67d14323e8325828-10"
15+
},
16+
{
17+
"subset": "compound",
18+
"url": "https://cellpainting-gallery.s3.amazonaws.com/cpg0016-jump-assembled/source_all/workspace/profiles/jump-profiling-recipe_2024_a917fa7/COMPOUND/profiles_var_mad_int_featselect_harmony/profiles_var_mad_int_featselect_harmony.parquet",
19+
"recipe_permalink": "https://github.com/broadinstitute/jump-profiling-recipe/tree/a917fa79342ff92cf0ea05d6d9174d9028a90f8f",
20+
"config_permalink": "https://github.com/broadinstitute/jump-profiling-recipe/blob/a917fa79342ff92cf0ea05d6d9174d9028a90f8f/inputs/compound.json",
21+
"etag": "1368a48ddbd4c44b1bfbc084591aaf10-338"
22+
},
23+
{
24+
"subset": "orf_interpretable",
25+
"url": "https://cellpainting-gallery.s3.amazonaws.com/cpg0016-jump-assembled/source_all/workspace/profiles/jump-profiling-recipe_2024_a917fa7/ORF/profiles_wellpos_cc_var_mad_outlier_featselect_sphering_harmony/profiles_wellpos_cc_var_mad_outlier.parquet",
26+
"recipe_permalink": "https://github.com/broadinstitute/jump-profiling-recipe/tree/a917fa79342ff92cf0ea05d6d9174d9028a90f8f",
27+
"config_permalink": "https://github.com/broadinstitute/jump-profiling-recipe/blob/a917fa79342ff92cf0ea05d6d9174d9028a90f8f/inputs/orf.json",
28+
"etag": "97b0c31d7d678ca2a5e2353df5799fd8-217"
29+
},
30+
{
31+
"subset": "crispr_interpretable",
32+
"url": "https://cellpainting-gallery.s3.amazonaws.com/cpg0016-jump-assembled/source_all/workspace/profiles/jump-profiling-recipe_2024_a917fa7/CRISPR/profiles_wellpos_cc_var_mad_outlier_featselect_sphering_harmony_PCA_corrected/profiles_wellpos_cc_var_mad_outlier.parquet",
33+
"recipe_permalink": "https://github.com/broadinstitute/jump-profiling-recipe/tree/a917fa79342ff92cf0ea05d6d9174d9028a90f8f",
34+
"config_permalink": "https://github.com/broadinstitute/jump-profiling-recipe/blob/a917fa79342ff92cf0ea05d6d9174d9028a90f8f/inputs/crispr.json",
35+
"etag": "90b08b824c06bcf16dfc5e788e74f099-135"
36+
},
37+
{
38+
"subset": "compound_interpretable",
39+
"url": "https://cellpainting-gallery.s3.amazonaws.com/cpg0016-jump-assembled/source_all/workspace/profiles/jump-profiling-recipe_2024_a917fa7/COMPOUND/profiles_var_mad_int_featselect_harmony/profiles_var_mad_int.parquet",
40+
"recipe_permalink": "https://github.com/broadinstitute/jump-profiling-recipe/tree/a917fa79342ff92cf0ea05d6d9174d9028a90f8f",
41+
"config_permalink": "https://github.com/broadinstitute/jump-profiling-recipe/blob/a917fa79342ff92cf0ea05d6d9174d9028a90f8f/inputs/compound.json",
42+
"etag": "b638fa24310db569bc869af92e16f69c-1444"
43+
},
44+
{
45+
"subset": "all",
46+
"url": "https://cellpainting-gallery.s3.amazonaws.com/cpg0016-jump-assembled/source_all/workspace/profiles/jump-profiling-recipe_2024_0224e0f/ALL/profiles_wellpos_cc_var_mad_outlier_featselect_sphering_harmony/profiles_wellpos_cc_var_mad_outlier_featselect_sphering_harmony.parquet",
47+
"recipe_permalink": "https://github.com/broadinstitute/jump-profiling-recipe/tree/0224e0fc23a84e7e84b091f320a9e68b3217343f",
48+
"config_permalink": "https://github.com/broadinstitute/jump-profiling-recipe/blob/0224e0fc23a84e7e84b091f320a9e68b3217343f/inputs/pipeline_2.json",
49+
"etag": "71d03c195e41739af0f1ba64b4f6be73-324"
50+
},
51+
{
52+
"subset": "all_interpretable",
53+
"url": "https://cellpainting-gallery.s3.amazonaws.com/cpg0016-jump-assembled/source_all/workspace/profiles/jump-profiling-recipe_2024_0224e0f/ALL/profiles_wellpos_cc_var_mad_outlier_featselect_sphering_harmony/profiles_wellpos_cc_var_mad_outlier_featselect.parquet",
54+
"recipe_permalink": "https://github.com/broadinstitute/jump-profiling-recipe/tree/0224e0fc23a84e7e84b091f320a9e68b3217343f",
55+
"config_permalink": "https://github.com/broadinstitute/jump-profiling-recipe/blob/0224e0fc23a84e7e84b091f320a9e68b3217343f/inputs/pipeline_2.json",
56+
"etag": "023d74cbf007bb6d837724ac8aa78fb4-324"
57+
}
58+
]

manifests/src/README.md

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,23 +1,23 @@
11
# Automated versioning with Zenodo
22

3-
The scripts in this folder are used for automated versioning by uploading the manifest file (`profile_index.csv`, currently the only one in the root folder) to Zenodo.
3+
The scripts in this folder are used for automated versioning by uploading the manifest file (`profile_index.json`, currently the only one in the root folder) to Zenodo.
44
In the future, additional manifest files will be added and updated in this repository, triggering the same automated versioning process.
55

66
## Updating new versions
77

8-
To release a new set of assembled JUMP profiles, manually update the URLs in `profile_index.csv` to point to the new location.
9-
If necessary, update the associated names for new dataset types.
8+
To release a new set of assembled JUMP profiles, manually update the URLs in `profile_index.json` to point to the new location.
9+
If necessary, update the associated names for new dataset types and (optionally) the permanent link to the version of the recipe that produced them.
1010

1111
## Update ETags to reflect new URLs
1212

1313
After updating a URL, the ETag (provided by S3) will no longer match. To update the ETags, run the following command from the root folder of the repository:
1414

1515
```bash
16-
bash manifests/src/update_etags.sh manifests/profile_index.csv| sponge manifests/profile_index.csv
16+
bash manifests/src/update_etags.sh manifests/profile_index.json | sponge manifests/profile_index.json
1717
```
1818

1919
Note: If using Nix, all dependencies are already included in the flake at the root folder. Simply run `nix develop --extra-experimental-features nix-command --extra-experimental-features flakes` before the above command.
2020

2121
## Commit changes
2222

23-
Add and commit the updated `profile_index.csv`. This should trigger an update on Zenodo. Once the update is complete, the csv files in the repository and on Zenodo should match.
23+
Add and commit the updated `profile_index.json`. This should trigger an update on Zenodo. Once the update is complete, the JSON files in the repository and on Zenodo should match.

manifests/src/update_etags.sh

Lines changed: 13 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -1,38 +1,20 @@
11
#!/usr/bin/env bash
2-
# Fetch updated ETag values for URLs in a CSV file.
2+
# Fetch updated ETag values for URLs in a json file
3+
# Usage: `bash update_etags.sh profile_index.json`
34

4-
# Note that quotes are expected in the csv but ommited when
55
input_file="${1:?Usage: $0 <input_file>}" # abort with a usage message when no input file is given
6-
url_header="url"
7-
etag_header="etag"
86

9-
get_column() {
10-
# gets id of column $1 in ${input_file}.
11-
awk -F',' -v col="\"$1\"" 'NR==1 { for (i=1; i<=NF; ++i) { if ($i==col) print i } }' "${input_file}"
12-
}
7+
# -r emits raw strings, so no quote-stripping pass is needed; the filter is quoted so the shell cannot glob it
urls=$(jq -r '.[].url' "${input_file}")
138

14-
# Check if input file is provided
15-
if [ -z "${input_file}" ]; then
16-
echo "Usage: $0 <input_file>"
17-
exit 1
18-
fi
9+
# Pull and clean the etag from the AWS url
10+
# URLs go through a '%s' format so a '%' in a URL is not read as a printf directive.
# Header parsing runs once, outside xargs, so awk's $2 stays single-quoted and no `sh -c` string is built from the URLs.
NEW_ETAGS=$(printf '%s\n' "${urls}" | xargs -I {} curl -I --silent {} | awk '/ETag:/ {print $2}' | tr -d '"\r')
1911

20-
url_column=$(get_column "${url_header}")
21-
urls=$(awk -F',' -v col="${url_column}" 'NR>1 {gsub(/^"|"$/, "", $col); print $col}' "${input_file}")
12+
# Format the list into json
13+
# Read each raw line as a string and collect them into one JSON array
JSON_LIST=$(jq -Rn '[inputs]' <<<"${NEW_ETAGS}")
2214

23-
# Fetch ETags for each URL in a loop
24-
etag_values='"etag"'
25-
while IFS= read -r url; do
26-
etag=$(curl -I --silent "${url}" | awk '/ETag:/ {print $2}')
27-
etag_values+="\n${etag}"
28-
done <<<"$urls"
29-
30-
# Remove existing ETag column if present
31-
etag_column=$(get_column "${etag_header}")
32-
33-
# Combine original data (without ETag) with new ETag values
34-
if [[ -n "${etag_column}" ]]; then # Replace $etag_column in $input_file with $etag_values
35-
awk -F',' -v OFS=',' -v col="${etag_column}" 'NR==FNR{a[NR]=$1;next}{$col=a[FNR]}1' <(echo -e "${etag_values}") "${input_file}"
36-
else # Append $etag_values as a new column on the right
37-
paste -d',' "${input_file}" <(echo -e "${etag_values}")
38-
fi
15+
# Print the list again with the updated ETags
16+
# Note that ETags are matched to entries by position, so this assumes every URL returned an ETag
17+
jq --argjson etags "${JSON_LIST}" '[
18+
range(0; length) as $i
19+
| {subset: .[$i].subset, url: .[$i].url, recipe_permalink: .[$i].recipe_permalink, config_permalink: .[$i].config_permalink, etag: $etags[$i]}
20+
]' "${input_file}" # read the manifest given as the first argument, not a hard-coded file name in the current directory

0 commit comments

Comments
 (0)