Skip to content

Commit ef7aba3

Browse files
bors[bot]CarloLucibelloDhairya Gandhi
authored
Merge #1442
1442: Soft deprecation for Datasets r=DhairyaLGandhi a=CarloLucibello Add a soft deprecations path for the datasets I brutally removed in #1377 . I added the old tests back, verified they passed, then removed them again. Fix #1426 Co-authored-by: Carlo Lucibello <carlo.lucibello@gmail.com> Co-authored-by: Dhairya Gandhi <dhairya@juliacopmuting.com>
2 parents 16235e7 + 402c72b commit ef7aba3

File tree

10 files changed

+668
-24
lines changed

10 files changed

+668
-24
lines changed

bors.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
11
status = [
2-
"ci/gitlab%"
2+
"buildkite/flux-dot-jl"
33
]
44
timeout-sec = 7200

src/data/Data.jl

Lines changed: 53 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -7,32 +7,62 @@ include("dataloader.jl")
77
export DataLoader
88

99

10-
## TODO: remove in v0.13 ##############
11-
module MNIST
12-
images() = error("Dataset is deprecated, use MLDatasets.jl instead.")
13-
labels() = error("Dataset is deprecated, use MLDatasets.jl instead.")
14-
end
15-
module Iris
16-
features() = error("Dataset is deprecated, use MLDatasets.jl instead.")
17-
labels() = error("Dataset is deprecated, use MLDatasets.jl instead.")
10+
## TODO for v0.13: remove everything below ##############
11+
## Also remove the following deps:
12+
## AbstractTrees, ZipFiles, CodecZLib
13+
14+
import ..Flux
15+
import SHA
16+
17+
# Log the warning shown by every deprecated dataset accessor.
function deprecation_message()
    @warn("Flux's datasets are deprecated, please use the package MLDatasets.jl")
end
18+
19+
"""
    deps(path...)

Join `path...` onto the `deps` directory and return the resulting path.

The `deps` directory is resolved two levels above this file's directory. When
`@__DIR__` is `nothing` (sysimages) the relative directory `"deps"` is used instead.
"""
function deps(path...)
    here = @__DIR__
    root = isnothing(here) ? "deps" : joinpath(here, "..", "..", "deps")
    return joinpath(root, path...)
end
19-
module FashionMNIST
20-
images() = error("Dataset is deprecated, use MLDatasets.jl instead.")
21-
labels() = error("Dataset is deprecated, use MLDatasets.jl instead.")
22-
end
23-
module CMUDict
24-
phones() = error("Dataset is deprecated, use MLDatasets.jl instead.")
25-
symbols() = error("Dataset is deprecated, use MLDatasets.jl instead.")
26-
rawdict() = error("Dataset is deprecated, use MLDatasets.jl instead.")
27-
cmudict() = error("Dataset is deprecated, use MLDatasets.jl instead.")
26+
27+
"""
    download_and_verify(url, path, hash)

Download `url` to a temporary file, check that the SHA-256 digest of the download
equals `hash` (a hexadecimal string), then move the file to `path`, replacing any
existing file. Returns the result of `mv`.

Throws an `ErrorException` listing the expected and calculated digests when they
differ. In that case the temporary file is removed and `path` is not written.
"""
function download_and_verify(url, path, hash)
    tmppath = tempname()
    download(url, tmppath)
    hash_download = open(tmppath) do f
        bytes2hex(SHA.sha256(f))
    end
    # Compare by value, not identity: `hash` may be any `AbstractString`
    # (e.g. a `SubString`), and such a value is never `===` to a `String`.
    if hash_download != hash
        rm(tmppath; force=true)  # do not leave the rejected download behind
        msg = "Hash Mismatch!\n"
        msg *= " Expected sha256: $hash\n"
        msg *= " Calculated sha256: $hash_download"
        error(msg)
    end
    return mv(tmppath, path; force=true)
end
29-
module Sentiment
30-
train() = error("Dataset is deprecated, use MLDatasets.jl instead.")
31-
test() = error("Dataset is deprecated, use MLDatasets.jl instead.")
32-
dev() = error("Dataset is deprecated, use MLDatasets.jl instead.")
41+
42+
# Module initialiser: create the `deps` directory if it does not exist yet.
function __init__()
    depsdir = deps()
    return mkpath(depsdir)
end
3445

35-
export MNIST, Iris, FashionMNIST, CMUDict, Sentiment
46+
include("mnist.jl")
47+
export MNIST
48+
49+
include("fashion-mnist.jl")
50+
export FashionMNIST
51+
52+
include("cmudict.jl")
53+
export CMUDict
54+
using .CMUDict; export cmudict
55+
56+
include("tree.jl")
57+
include("sentiment.jl")
58+
export Sentiment
59+
60+
include("iris.jl")
61+
export Iris
62+
63+
include("housing.jl")
64+
export Housing
65+
3666
#########################################
3767

38-
end#module
68+
end#module

src/data/cmudict.jl

Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,77 @@
1+
module CMUDict
2+
3+
export cmudict
4+
5+
using ..Data: deps, download_and_verify, deprecation_message
6+
7+
const version = "0.7b"
8+
const cache_prefix = "https://cache.julialang.org"
9+
10+
# Download the three CMUDict files into `deps("cmudict")`, verifying each against its
# SHA-256 digest. Does nothing when the directory and all three files already exist.
function load()
    files = [
        ("", "209a8b4cd265013e96f4658632a9878103b0c5abf62b50d4ef3ae1be226b29e4"),
        (".phones", "ffb588a5e55684723582c7256e1d2f9fadb130011392d9e59237c76e34c2cfd6"),
        (".symbols", "408ccaae803641c6d7b626b6299949320c2dbca96b2220fd3fb17887b023b027"),
    ]
    cached = isdir(deps("cmudict")) &&
        all(isfile(deps("cmudict", "cmudict$suffix")) for (suffix, _) in files)
    cached && return
    @info "Downloading CMUDict dataset"
    mkpath(deps("cmudict"))
    for (suffix, sha) in files
        url = "$cache_prefix/https://svn.code.sf.net/p/cmusphinx/code/trunk/cmudict/cmudict-$version$suffix"
        download_and_verify(url, deps("cmudict", "cmudict$suffix"), sha)
    end
end
26+
27+
"""
28+
phones()
29+
Return a `Vector` containing the phones used in the CMU Pronouncing Dictionary.
30+
"""
31+
function phones()
    deprecation_message()
    load()
    contents = read(deps("cmudict", "cmudict.phones"), String)
    rows = split(contents, "\n", keepempty = false)
    # Each row is tab-separated; the phone is the first field.
    return [Symbol(first(split(row, "\t"))) for row in rows]
end
37+
38+
"""
39+
symbols()
40+
Return a `Vector` containing the symbols used in the CMU Pronouncing Dictionary.
41+
A symbol is a phone with optional auxiliary symbols, indicating for example the
42+
amount of stress on the phone.
43+
"""
44+
function symbols()
    deprecation_message()
    load()
    listing = read(deps("cmudict", "cmudict.symbols"), String)
    # One symbol per non-empty line.
    return map(Symbol, split(listing, "\n", keepempty = false))
end
50+
51+
"""
52+
rawdict()
53+
Return the unfiltered CMU Pronouncing Dictionary.
54+
"""
55+
function rawdict()
    deprecation_message()
    load()
    text = read(deps("cmudict", "cmudict"), String)
    # Split every line on whitespace and drop lines that produce no tokens.
    entries = filter(!isempty, split.(split(text, "\n")))
    # First token is the word; the remaining tokens are its pronunciation symbols.
    return Dict(String(tokens[1]) => Symbol.(tokens[2:end]) for tokens in entries)
end
61+
62+
# Return `true` when `s` is pure ASCII and matches the word pattern below
# (word characters, '-' and '.').
function validword(s)
    isascii(s) || return false
    return occursin(r"^[\w\-\.]+$", s)
end
63+
64+
"""
65+
cmudict()
66+
Return a filtered CMU Pronouncing Dictionary.
67+
It is filtered so each word contains only ASCII characters and a combination of
68+
word characters (as determined by the regex engine using `\\w`), '-' and '.'.
69+
"""
70+
function cmudict()
    deprecation_message()
    entries = rawdict()
    # Keep only the entries whose key (the word) passes `validword`.
    return filter(entry -> validword(first(entry)), entries)
end
74+
75+
# Return the characters 'A'-'Z', '0'-'9', '_', '-' and '.' as a `Vector{Char}`.
function alphabet()
    return vcat('A':'Z', '0':'9', ['_', '-', '.'])
end
76+
77+
end

src/data/fashion-mnist.jl

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
module FashionMNIST
2+
3+
using ..MNIST: gzopen, imageheader, rawimage, labelheader, rawlabel
4+
using ..Data: download_and_verify, deprecation_message
5+
6+
# Directory holding the Fashion-MNIST files; a relative path is used when
# `@__DIR__` is `nothing`.
const dir = isnothing(@__DIR__) ? joinpath("deps", "fashion-mnist") :
    joinpath(@__DIR__, "../../deps/fashion-mnist")
11+
12+
# Make sure the four Fashion-MNIST files exist, uncompressed, inside `dir`.
# A file that is already present is skipped; otherwise its `.gz` archive is downloaded,
# checked against its SHA-256 digest and unpacked next to it.
#
# All paths are built explicitly from `dir` instead of wrapping the work in
# `cd(dir) do … end`: `cd` changes the working directory of the whole process, which
# affects any other task resolving relative paths while the download runs.
function load()
    mkpath(dir)
    files = [
        ("train-images-idx3-ubyte", "3aede38d61863908ad78613f6a32ed271626dd12800ba2636569512369268a84"),
        ("train-labels-idx1-ubyte", "a04f17134ac03560a47e3764e11b92fc97de4d1bfaf8ba1a3aa29af54cc90845"),
        ("t10k-images-idx3-ubyte", "346e55b948d973a97e58d2351dde16a484bd415d4595297633bb08f03db6a073"),
        ("t10k-labels-idx1-ubyte", "67da17c76eaffca5446c3361aaab5c3cd6d1c2608764d35dfb1850b086bf8dd5"),
    ]
    for (file, hash) in files
        target = joinpath(dir, file)
        isfile(target) && continue
        archive = joinpath(dir, "$file.gz")
        @info "Downloading Fashion-MNIST dataset"
        download_and_verify("http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/$file.gz", archive, hash)
        # Unpack the archive into the plain file the accessors read.
        open(target, "w") do io
            write(io, gzopen(read, archive))
        end
    end
end
28+
29+
const TRAINIMAGES = joinpath(dir, "train-images-idx3-ubyte")
30+
const TRAINLABELS = joinpath(dir, "train-labels-idx1-ubyte")
31+
const TESTIMAGES = joinpath(dir, "t10k-images-idx3-ubyte")
32+
const TESTLABELS = joinpath(dir, "t10k-labels-idx1-ubyte")
33+
34+
"""
35+
images()
36+
images(:test)
37+
Load the Fashion-MNIST images.
38+
Each image is a 28×28 array of `Gray` colour values
39+
(see [Colors.jl](https://github.com/JuliaGraphics/Colors.jl)).
40+
Return the 60,000 training images by default; pass `:test` to retrieve the
41+
10,000 test images.
42+
"""
43+
function images(set = :train)
    deprecation_message()
    load()
    # Anything other than `:train` selects the test file.
    source = set == :train ? TRAINIMAGES : TESTIMAGES
    buffer = IOBuffer(read(source))
    # The second header field gives the number of images that follow.
    _, count, _, _ = imageheader(buffer)
    return [rawimage(buffer) for _ in 1:count]
end
50+
51+
"""
52+
labels()
53+
labels(:test)
54+
Load the labels corresponding to each of the images returned from [`images()`](@ref).
55+
Each label is a number from 0-9.
56+
Return the 60,000 training labels by default; pass `:test` to retrieve the
57+
10,000 test labels.
58+
"""
59+
function labels(set = :train)
    deprecation_message()
    load()
    # Anything other than `:train` selects the test file.
    source = set == :train ? TRAINLABELS : TESTLABELS
    buffer = IOBuffer(read(source))
    # The second header field gives the number of labels that follow.
    _, count = labelheader(buffer)
    return [rawlabel(buffer) for _ in 1:count]
end
66+
67+
end

src/data/housing.jl

Lines changed: 120 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,120 @@
1+
"""
2+
1. Title: Boston Housing Data
3+
2. Sources:
4+
(a) Origin: This dataset was taken from the StatLib library which is
5+
maintained at Carnegie Mellon University.
6+
(b) Creator: Harrison, D. and Rubinfeld, D.L. 'Hedonic prices and the
7+
demand for clean air', J. Environ. Economics & Management,
8+
vol.5, 81-102, 1978.
9+
(c) Date: July 7, 1993
10+
3. Number of Instances: 506
11+
4. Number of Attributes: 13 continuous attributes (including "class"
12+
attribute "MEDV"), 1 binary-valued attribute.
13+
5. Attribute Information:
14+
1. CRIM per capita crime rate by town
15+
2. ZN proportion of residential land zoned for lots over
16+
25,000 sq.ft.
17+
3. INDUS proportion of non-retail business acres per town
18+
4. CHAS Charles River dummy variable (= 1 if tract bounds
19+
river; 0 otherwise)
20+
5. NOX nitric oxides concentration (parts per 10 million)
21+
6. RM average number of rooms per dwelling
22+
7. AGE proportion of owner-occupied units built prior to 1940
23+
8. DIS weighted distances to five Boston employment centres
24+
9. RAD index of accessibility to radial highways
25+
10. TAX full-value property-tax rate per 10,000 dollars
26+
11. PTRATIO pupil-teacher ratio by town
27+
12. B 1000(Bk - 0.63)^2 where Bk is the proportion of blacks
28+
by town
29+
13. LSTAT % lower status of the population
30+
14. MEDV Median value of owner-occupied homes in 1000's of dollars
31+
Downloaded From: https://archive.ics.uci.edu/ml/machine-learning-databases/housing/housing.data
32+
"""
33+
module Housing
34+
35+
using DelimitedFiles
36+
using ..Data: deps, download_and_verify, deprecation_message
37+
38+
#Uncomment if package exists
39+
#const cache_prefix = "https://cache.julialang.org/"
40+
const cache_prefix = ""
41+
42+
# Make sure `deps("housing.data")` exists as a comma-separated file.
#
# Returns immediately when the file is already present. Otherwise the raw dataset is
# downloaded and verified into a temporary file, rewritten with runs of spaces replaced
# by commas (leading whitespace stripped), and only then moved to its final location.
#
# The raw download is never stored at the final path, so an interrupted run cannot leave
# an unconverted `housing.data` that later calls would mistake for a finished file.
# The scratch file is opened in write mode so a stale `tempfile.data` from an earlier
# failure is truncated rather than appended to.
function load()
    isfile(deps("housing.data")) && return

    @info "Downloading the Boston housing Dataset"
    raw = tempname()
    converted = joinpath(deps(), "tempfile.data")
    try
        download_and_verify("$(cache_prefix)http://archive.ics.uci.edu/ml/machine-learning-databases/housing/housing.data",
                            raw,
                            "baadf72995725d76efe787b664e1f083388c79ba21ef9a7990d87f774184735a")
        open(converted, "w") do fout
            for line in eachline(raw)
                println(fout, replace(lstrip(line), r" +" => ","))
            end
        end
        return mv(converted, deps("housing.data"), force=true)
    finally
        rm(raw; force=true)  # the raw download is only an intermediate
    end
end
63+
64+
"""
65+
Gets the targets for the Boston housing dataset, a 506 element array listing the targets for each example
66+
```julia
67+
julia> using Flux
68+
julia> target = Flux.Data.Housing.targets()
69+
julia> summary(target)
70+
506×1 Array{Float64,2}
71+
julia> target[1]
72+
24.0
```
73+
"""
74+
function targets()
    deprecation_message()
    load()
    table = readdlm(deps("housing.data"), ',')
    # The target is the last column of the table.
    lastcol = Vector{Float64}(table[:, end])
    return reshape(lastcol, (506, 1))
end
80+
81+
82+
"""
83+
Gets the names of the features provided in the dataset
84+
"""
85+
function feature_names()
    # Listed in the same order as the feature columns.
    return String[
        "crim", "zn", "indus", "chas", "nox", "rm", "age",
        "dis", "rad", "tax", "ptratio", "b", "lstat",
    ]
end
88+
89+
90+
"""
91+
Gets the features of the Boston Housing Dataset. This is a 506x13 Matrix of Float64 datatypes.
92+
The values are in the order ["crim","zn","indus","chas","nox","rm","age","dis","rad","tax","ptratio","b","lstat"].
93+
It has 506 examples.
94+
```julia
95+
julia> using Flux
96+
julia> features = Flux.Data.Housing.features()
97+
julia> summary(features)
98+
506×13 Array{Float64,2}
99+
julia> features[1, :]
100+
13-element Array{Float64,1}:
101+
0.00632
102+
18.0
103+
2.31
104+
0.0
105+
0.538
106+
107+
296.0
108+
15.3
109+
396.9
110+
4.98
```
111+
"""
112+
function features()
    deprecation_message()
    load()
    table = readdlm(deps("housing.data"), ',')
    # The first 13 columns are the features.
    return Matrix{Float64}(table[:, 1:13])
end
118+
119+
120+
end

0 commit comments

Comments
 (0)