@@ -16,29 +16,46 @@ def load_age_ordered_articles(self,
1616 path_ludwig_data : Optional [Path ] = None ,
1717 ) -> List [Transcript ]:
1818
19+ """
20+ notes:
21+ age-ordering articles requires loading highest version first.
22+ (higher version -> higher simplification -> smaller grade level -> younger students)
23+ """
24+
def path_to_version(p: Path) -> int:
    """Return the version number encoded in an article file name.

    The version is the last dot-separated field of the file stem,
    e.g. ``some-article.en.3.txt`` -> ``3``.

    Raises:
        ValueError: if the last field of the stem is not an integer. The
            message names the offending path so that a stray file matched
            by the ``*.en.*.txt`` glob can be located.
    """
    version_field = p.stem.split('.')[-1]
    try:
        return int(version_field)
    except ValueError as err:
        # same exception type as a bare int() call, but with the file identified
        raise ValueError(
            f'Cannot parse version from {p}: expected an integer, got {version_field!r}'
        ) from err
27+
1928 if path_ludwig_data is None :
2029 path_ludwig_data = configs .Dirs .ludwig_data
2130
22- print ('Preparing AONewsela articles...' )
23- pbar = pyprind .ProgBar (6 , stream = 1 )
31+
32+ path_data = path_ludwig_data / 'AONewsela' / 'newsela_article_corpus_2016-01-29'
33+ print (f'Looking for articles in { path_data } ' )
34+ if not path_data .exists ():
35+ raise FileNotFoundError (f'Did not find { path_data } .'
36+ f' Do you have access to the UIUC Language Learning Lab shared drive?' )
37+
38+ # search directory once, to save time
39+ article_paths = [p for p in path_data .rglob ('*.en.*.txt' )]
40+ if not article_paths :
41+ raise RuntimeError (f'Did not find any articles in { article_paths } ' )
42+
43+ print (f'Preparing { len (article_paths )} AONewsela articles...' )
44+ pbar = pyprind .ProgBar (len (article_paths ), stream = 1 )
2445
2546 res = []
26- for version in reversed ( range ( 6 )): # simple first means highest version first
47+ for path_article in sorted ( article_paths , key = lambda p : path_to_version ( p ), reverse = True ):
2748
28- articles_path = path_ludwig_data / 'AONewsela' / 'newsela_article_corpus_2016-01-29'
29- print (f'Looking for articles in { articles_path } ' )
30- if not articles_path .exists ():
31- raise FileNotFoundError (f'Did not find { articles_path } .'
32- f' Do you have access to the UIUC Language Learning Lab shared drive?' )
49+ text = path_article .read_text (encoding = 'utf-8' ).replace ('\n ' , ' ' ).lower ()
3350
34- for path in articles_path . glob ( f'*.en. { version } .txt' ) :
35- text = path . read_text ( encoding = 'utf-8' ). replace ( ' \n ' , ' ' ). lower ()
51+ if not self . params . punctuation :
52+ raise NotImplementedError
3653
37- if not self .params .punctuation :
38- raise NotImplementedError
54+ res .append (Transcript (text , path_to_version (path_article )))
3955
40- res . append ( Transcript ( text , version ) )
56+ pbar . update ( )
4157
42- pbar .update ()
58+ print (f'Found { len (res )} articles' , flush = True )
59+ assert len (res ) == len (article_paths )
4360
4461 return res
0 commit comments