Skip to content

Commit b67b6b5

Browse files
committed
Add meca content provider
1 parent 09f3d53 commit b67b6b5

File tree

3 files changed

+132
-0
lines changed

3 files changed

+132
-0
lines changed

repo2docker/app.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -154,6 +154,7 @@ def _default_log_level(self):
154154
contentproviders.Swhid,
155155
contentproviders.Mercurial,
156156
contentproviders.Git,
157+
contentproviders.Meca,
157158
],
158159
config=True,
159160
help="""

repo2docker/contentproviders/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,3 +6,4 @@
66
from .mercurial import Mercurial
77
from .swhid import Swhid
88
from .zenodo import Zenodo
9+
from .meca import Meca

repo2docker/contentproviders/meca.py

Lines changed: 130 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,130 @@
1+
from .base import ContentProvider
2+
from requests import Session
3+
import os
4+
from hashlib import md5
5+
from os import path
6+
import tempfile
7+
import shutil
8+
import xml.etree.ElementTree as ET
9+
from zipfile import ZipFile, is_zipfile
10+
from urllib.parse import urlparse, urlunparse
11+
12+
def get_hashed_slug(url, changes_with_content):
    """Build a ``meca-<md5 hex>`` slug for a bundle URL.

    The URL's params, query string and fragment are dropped before hashing,
    so two URLs that differ only in those parts produce the same slug.
    ``changes_with_content`` is formatted into the hashed text, so a
    different value yields a different slug for the same URL.
    """
    parts = urlparse(url)
    bare_url = urlunparse(parts._replace(params="", query="", fragment=""))
    digest = md5(f"{bare_url}-{changes_with_content}".encode()).hexdigest()
    return "meca-" + digest
20+
21+
def fetch_zipfile(session, url, dst_dir):
    """Download ``url`` into ``dst_dir`` as ``meca.zip`` and return its path.

    Parameters:
        session: object exposing a ``requests``-style ``get`` method.
        url: location of the zip archive to download.
        dst_dir: existing directory that receives the downloaded file.

    Returns:
        The path of the written file, ``<dst_dir>/meca.zip``.

    Raises:
        Whatever ``resp.raise_for_status()`` raises for an error response.
    """
    resp = session.get(url, headers={"accept": "application/zip"}, stream=True)
    try:
        resp.raise_for_status()

        dst_filename = path.join(dst_dir, "meca.zip")
        with open(dst_filename, "wb") as dst:
            # 64 KiB chunks: the body is streamed, so memory stays bounded
            # while avoiding thousands of tiny writes for large bundles.
            for chunk in resp.iter_content(chunk_size=64 * 1024):
                dst.write(chunk)
    finally:
        # A streamed response holds its connection until it is closed; release
        # it even when the status check or the write fails.
        resp.close()

    return dst_filename
31+
32+
33+
def handle_items(_, item):
    """Print ``item`` to standard output; the first argument is ignored."""
    print(item)
35+
36+
37+
def extract_validate_and_identify_bundle(zip_filename, dst_dir):
    """Extract a downloaded archive and locate its article source directory.

    The archive is unpacked into ``dst_dir``. If it contains a parsable
    ``manifest.xml`` with an ``article-source-directory`` item, the directory
    named by that item's ``href`` attribute (default ``bundle/``) is returned.

    Parameters:
        zip_filename: path of the downloaded archive.
        dst_dir: directory the archive is extracted into.

    Returns:
        ``(True, bundle_dir)`` when the manifest identifies an existing
        directory inside ``dst_dir``; ``(False, dst_dir)`` otherwise, so the
        caller can fall back to using everything that was extracted.

    Raises:
        RuntimeError: if ``zip_filename`` does not exist or is not a zip file.
    """
    if not os.path.exists(zip_filename):
        raise RuntimeError("Download MECA bundle not found")

    if not is_zipfile(zip_filename):
        raise RuntimeError("MECA bundle is not a zip file")

    with ZipFile(zip_filename, "r") as zip_ref:
        zip_ref.extractall(dst_dir)

    manifest = path.join(dst_dir, "manifest.xml")
    if not os.path.exists(manifest):
        return False, dst_dir

    try:
        root = ET.parse(manifest).getroot()
    except (ET.ParseError, OSError):
        # An unreadable or malformed manifest means "not a MECA bundle";
        # anything else (e.g. KeyboardInterrupt) must propagate.
        return False, dst_dir

    instance = root.find(
        "{*}item[@item-type='article-source-directory']/{*}instance"
    )
    if instance is None:
        return False, dst_dir

    article_source_dir = "bundle/"
    # The href attribute is usually namespaced (e.g. xlink:href), so match on
    # the suffix; if several attributes match, the last one wins.
    for attr, value in instance.attrib.items():
        if attr.endswith("href"):
            article_source_dir = value

    bundle_dir = path.join(dst_dir, article_source_dir)

    # The manifest comes from the downloaded archive and is untrusted: refuse
    # hrefs that resolve outside dst_dir ("../", absolute paths, symlinks) or
    # that do not name an existing directory.
    root_real = path.realpath(dst_dir)
    bundle_real = path.realpath(bundle_dir)
    inside_root = bundle_real == root_real or bundle_real.startswith(
        root_real + os.sep
    )
    if not inside_root or not path.isdir(bundle_real):
        return False, dst_dir

    return True, bundle_dir
66+
67+
68+
class Meca(ContentProvider):
    """A repo2docker content provider for MECA bundles.

    A spec is recognised by a URL scheme ending in ``+meca``
    (e.g. ``https+meca://host/path/bundle.zip``); the suffix is stripped to
    obtain the real download URL.
    """

    def __init__(self):
        super().__init__()
        # Set by detect(); initialised here so that reading content_id before
        # detection yields None instead of raising AttributeError.
        self.hashed_slug = None
        self.session = Session()
        self.session.headers.update({"user-agent": "repo2docker MECA"})

    def detect(self, spec, ref=None, extra_args=None):
        """Return ``{"url", "slug"}`` if ``spec`` is a MECA spec, else ``None``.

        `spec` carries a faux scheme of the form ``http[s]+meca`` for detection
        purposes. It is assumed to be a trusted, reachable MECA bundle from an
        allowed origin (the original author notes that the binderhub
        RepoProvider class already checks this).

        A HEAD request is made to read the ``ETag`` (or, failing that, the
        ``Content-Length``) header, which is folded into the slug so that it
        changes when the bundle content changes.
        """
        suffix = "+meca"
        parsed = urlparse(spec)
        if not parsed.scheme.endswith(suffix):
            return None
        parsed = parsed._replace(scheme=parsed.scheme[: -len(suffix)])
        url = urlunparse(parsed)

        # requests does not follow redirects for HEAD by default; follow them
        # so the headers describe the bundle itself, not a redirect response.
        r = self.session.head(url, allow_redirects=True)
        changes_with_content = r.headers.get("ETag") or r.headers.get("Content-Length")

        self.hashed_slug = get_hashed_slug(url, changes_with_content)

        return {"url": url, "slug": self.hashed_slug}

    def fetch(self, spec, output_dir, yield_output=False):
        """Download the bundle described by ``spec`` and unpack it into ``output_dir``.

        ``spec`` is the dict returned by ``detect``. This is a generator that
        yields progress messages. The archive is downloaded and extracted in
        a temporary directory, then the entries of the identified bundle
        directory (or of the whole extraction when no MECA manifest is
        recognised) are moved into ``output_dir``. ``yield_output`` is
        accepted but not used here.
        """
        hashed_slug = spec["slug"]
        url = spec["url"]

        yield "Creating temporary directory.\n"
        with tempfile.TemporaryDirectory() as tmpdir:
            yield f"Temporary directory created at {tmpdir}.\n"

            yield f"Fetching MECA Bundle {url}.\n"
            zip_filename = fetch_zipfile(self.session, url, tmpdir)

            yield f"Extracting MECA Bundle {zip_filename}.\n"
            is_meca, bundle_dir = extract_validate_and_identify_bundle(
                zip_filename, tmpdir
            )

            if not is_meca:
                yield "This doesn't look like a meca bundle, extracting everything.\n"

            yield f"Copying MECA Bundle at {bundle_dir} to {output_dir}.\n"
            for entry in os.listdir(bundle_dir):
                shutil.move(os.path.join(bundle_dir, entry), output_dir)

            # The directory itself is removed when the with-block exits.
            yield "Removing temporary directory.\n"

        yield f"MECA Bundle {hashed_slug} fetched and unpacked.\n"

    @property
    def content_id(self):
        """The slug computed by ``detect``, or ``None`` if nothing was detected yet."""
        return self.hashed_slug

0 commit comments

Comments
 (0)