Skip to content

Commit 3a4bde0

Browse files
committed
add setup.py
1 parent ee04640 commit 3a4bde0

File tree

5 files changed

+54
-16
lines changed

5 files changed

+54
-16
lines changed

aonewsela/__init__.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
2+
3+
__version__ = '1.0.0'

aonewsela/configs.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,4 +4,5 @@
44
class Dirs:
55
src = Path(__file__).parent
66
root = src.parent
7-
ludwig_data = '/media/ludwig_data' # path to location of shared drive, mounted on local machine
7+
mnt = Path('/media')
8+
ludwig_data = mnt / 'ludwig_data' # path to location of shared drive, mounted on local machine

aonewsela/pipeline.py

Lines changed: 31 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -16,29 +16,46 @@ def load_age_ordered_articles(self,
1616
path_ludwig_data: Optional[Path] = None,
1717
) -> List[Transcript]:
1818

19+
"""
20+
notes:
21+
age-ordering articles requires loading highest version first.
22+
(higher version -> higher simplification -> smaller grade level -> younger students)
23+
"""
24+
25+
def path_to_version(p: Path):
26+
return int(p.stem.split('.')[-1])
27+
1928
if path_ludwig_data is None:
2029
path_ludwig_data = configs.Dirs.ludwig_data
2130

22-
print('Preparing AONewsela articles...')
23-
pbar = pyprind.ProgBar(6, stream=1)
31+
32+
path_data = path_ludwig_data / 'AONewsela' / 'newsela_article_corpus_2016-01-29'
33+
print(f'Looking for articles in {path_data}')
34+
if not path_data.exists():
35+
raise FileNotFoundError(f'Did not find {path_data}.'
36+
f' Do you have access to the UIUC Language Learning Lab shared drive?')
37+
38+
# search directory once, to save time
39+
article_paths = [p for p in path_data.rglob('*.en.*.txt')]
40+
if not article_paths:
41+
raise RuntimeError(f'Did not find any articles in {path_data}')
42+
43+
print(f'Preparing {len(article_paths)} AONewsela articles...')
44+
pbar = pyprind.ProgBar(len(article_paths), stream=1)
2445

2546
res = []
26-
for version in reversed(range(6)): # simple first means highest version first
47+
for path_article in sorted(article_paths, key=lambda p: path_to_version(p), reverse=True):
2748

28-
articles_path = path_ludwig_data / 'AONewsela' / 'newsela_article_corpus_2016-01-29'
29-
print(f'Looking for articles in {articles_path}')
30-
if not articles_path.exists():
31-
raise FileNotFoundError(f'Did not find {articles_path}.'
32-
f' Do you have access to the UIUC Language Learning Lab shared drive?')
49+
text = path_article.read_text(encoding='utf-8').replace('\n', ' ').lower()
3350

34-
for path in articles_path.glob(f'*.en.{version}.txt'):
35-
text = path.read_text(encoding='utf-8').replace('\n', ' ').lower()
51+
if not self.params.punctuation:
52+
raise NotImplementedError
3653

37-
if not self.params.punctuation:
38-
raise NotImplementedError
54+
res.append(Transcript(text, path_to_version(path_article)))
3955

40-
res.append(Transcript(text, version))
56+
pbar.update()
4157

42-
pbar.update()
58+
print(f'Found {len(res)} articles', flush=True)
59+
assert len(res) == len(article_paths)
4360

4461
return res

requirements.txt

Lines changed: 0 additions & 1 deletion
This file was deleted.

setup.py

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
from setuptools import setup
2+
3+
from aonewsela import __name__, __version__
4+
5+
setup(
6+
name=__name__,
7+
version=__version__,
8+
packages=[__name__],
9+
include_package_data=True,
10+
install_requires=[
11+
'pyprind',
12+
],
13+
url='https://github.com/UIUCLearningLanguageLab/AOCHILDES',
14+
license='',
15+
author='Philip Huebner',
16+
author_email='info@philhuebner.com',
17+
description='Retrieve age-ordered text from the Newsela article corpus'
18+
)

0 commit comments

Comments
 (0)